Si importano i sette dataset
# Import the first dataset (loyalty-card subscriptions) and show descriptive statistics.
raw1 <- read_csv2(file.path(directory, "raw_1_cli_fid.csv"),
na = c("NA", ""))
knitr::kable(head(raw1))
| ID_CLI | ID_FID | ID_NEG | TYP_CLI_FID | COD_FID | STATUS_FID | DT_ACTIVE |
|---|---|---|---|---|---|---|
| 500 | 814583 | 32 | 1 | PREMIUM | 1 | 2019-02-23 |
| 16647 | 781106 | 44 | 1 | PREMIUM | 1 | 2019-02-02 |
| 835335 | 816369 | 28 | 1 | PREMIUM | 1 | 2019-02-23 |
| 9557 | 746573 | 9 | 1 | PREMIUM | 1 | 2019-01-11 |
| 767877 | 741522 | 41 | 1 | PREMIUM | 1 | 2019-01-07 |
| 743090 | 776971 | 2 | 1 | PREMIUM | 1 | 2019-01-30 |
summary(raw1)
ID_CLI ID_FID ID_NEG TYP_CLI_FID
Min. : 1 Min. : 3 Min. : 1.0 Min. :0.0000
1st Qu.:230659 1st Qu.:229067 1st Qu.: 6.0 1st Qu.:1.0000
Median :462034 Median :458969 Median :23.0 Median :1.0000
Mean :462486 Mean :459425 Mean :22.1 Mean :0.9848
3rd Qu.:693200 3rd Qu.:688435 3rd Qu.:36.0 3rd Qu.:1.0000
Max. :934919 Max. :928121 Max. :49.0 Max. :1.0000
COD_FID STATUS_FID DT_ACTIVE
Length:370135 Min. :0.00 Min. :2018-01-01
Class :character 1st Qu.:1.00 1st Qu.:2018-04-15
Mode :character Median :1.00 Median :2018-08-10
Mean :0.99 Mean :2018-08-14
3rd Qu.:1.00 3rd Qu.:2018-11-30
Max. :1.00 Max. :2019-05-11
# Import the second dataset (client accounts) and show descriptive statistics.
raw2 <- read_csv2(file.path(directory, "raw_2_cli_account.csv"),
na = c("NA", ""))
knitr::kable(head(raw2))
| ID_CLI | EMAIL_PROVIDER | W_PHONE | ID_ADDRESS | TYP_CLI_ACCOUNT | TYP_JOB |
|---|---|---|---|---|---|
| 600125 | libero.it | NA | 584621 | 4 | NA |
| 729642 | gmail.com | NA | 714144 | 4 | NA |
| 304639 | yahoo.it | 1 | 284176 | 4 | NA |
| 292497 | libero.it | 1 | 272563 | 4 | NA |
| 589492 | gmail.com | NA | 573304 | 2 | NA |
| 638815 | oulook.it | 1 | 622947 | 2 | NA |
summary(raw2)
## ID_CLI EMAIL_PROVIDER W_PHONE ID_ADDRESS
## Min. : 1 Length:369472 Min. :1 Min. : 1
## 1st Qu.:230783 Class :character 1st Qu.:1 1st Qu.:227903
## Median :462063 Mode :character Median :1 Median :456720
## Mean :462541 Mean :1 Mean :457283
## 3rd Qu.:693197 3rd Qu.:1 3rd Qu.:686533
## Max. :934919 Max. :1 Max. :900091
## NA's :27305
## TYP_CLI_ACCOUNT TYP_JOB
## Min. :2.000 Length:369472
## 1st Qu.:4.000 Class :character
## Median :4.000 Mode :character
## Mean :3.806
## 3rd Qu.:4.000
## Max. :4.000
##
# Import the third dataset (client addresses) and show descriptive statistics.
# NOTE(review): here only "" is treated as NA, unlike the other imports — presumably
# intentional for this file; confirm against the raw CSV.
raw3 <- read_csv2(file.path(directory, "raw_3_cli_address.csv"),
na = c(""))
knitr::kable(head(raw3))
| ID_ADDRESS | CAP | PRV | REGION |
|---|---|---|---|
| 1337 | 20083 | MI | LOMBARDIA |
| 1344 | 20024 | MI | LOMBARDIA |
| 1347 | 20090 | MI | LOMBARDIA |
| 1347 | 20090 | MI | LOMBARDIA |
| 1347 | 20090 | MI | LOMBARDIA |
| 1347 | 20090 | MI | LOMBARDIA |
summary(raw3)
## ID_ADDRESS CAP PRV REGION
## Min. : 1 Length:1211332 Length:1211332 Length:1211332
## 1st Qu.:221063 Class :character Class :character Class :character
## Median :437083 Mode :character Mode :character Mode :character
## Mean :443391
## 3rd Qu.:664931
## Max. :900090
# Import the fourth dataset (privacy consents) and show descriptive statistics.
raw4 <- read_csv2(file.path(directory, "raw_4_cli_privacy.csv"),
na = c("NA", ""))
knitr::kable(head(raw4))
| ID_CLI | FLAG_PRIVACY_1 | FLAG_PRIVACY_2 | FLAG_DIRECT_MKT |
|---|---|---|---|
| 4691 | 1 | 1 | 1 |
| 3434 | 0 | 1 | 0 |
| 3533 | 1 | 1 | 1 |
| 9866 | 1 | 1 | 1 |
| 5799 | 1 | 1 | 1 |
| 4660 | 0 | 1 | 0 |
summary(raw4)
## ID_CLI FLAG_PRIVACY_1 FLAG_PRIVACY_2 FLAG_DIRECT_MKT
## Min. : 1 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:230783 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :462063 Median :1.0000 Median :1.0000 Median :1.0000
## Mean :462541 Mean :0.6557 Mean :0.9356 Mean :0.6707
## 3rd Qu.:693197 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :934919 Max. :1.0000 Max. :1.0000 Max. :1.0000
# Import the fifth dataset (campaign catalogue) and show descriptive statistics.
raw5 <- read_csv2(file.path(directory, "raw_5_camp_cat.csv"),
na = c("NA", ""))
knitr::kable(head(raw5))
| ID_CAMP | TYP_CAMP | CHANNEL_CAMP |
|---|---|---|
| 757 | PERSONALIZED | |
| 759 | PERSONALIZED | |
| 760 | PERSONALIZED | |
| 761 | PERSONALIZED | |
| 762 | PERSONALIZED | |
| 763 | PERSONALIZED |
summary(raw5)
## ID_CAMP TYP_CAMP CHANNEL_CAMP
## Min. : 5.0 Length:848 Length:848
## 1st Qu.: 327.8 Class :character Class :character
## Median : 561.5 Mode :character Mode :character
## Mean : 559.6
## 3rd Qu.: 812.2
## Max. :1052.0
# Import the sixth dataset (campaign events) and show descriptive statistics.
raw6 <- read_csv2(file.path(directory, "raw_6_camp_event.csv"),
na = c("NA", ""))
knitr::kable(head(raw6))
| ID_EVENT | ID_CLI | ID_CAMP | ID_DELIVERY | TYP_EVENT | EVENT_DATE |
|---|---|---|---|---|---|
| 11548588 | 411925 | 948 | 8996 | V | 2019-01-01 01:42:24 |
| 11548640 | 313259 | 949 | 8997 | V | 2019-01-01 02:54:04 |
| 11548572 | 327299 | 941 | 8817 | V | 2019-01-01 01:23:29 |
| 11548515 | 627427 | 923 | 8263 | V | 2019-01-01 00:43:06 |
| 11548609 | 265258 | 950 | 8998 | V | 2019-01-01 02:11:03 |
| 11548497 | 693938 | 955 | 9115 | V | 2019-01-01 00:29:04 |
summary(raw6)
## ID_EVENT ID_CLI ID_CAMP ID_DELIVERY
## Min. :11548441 Min. : 1 Min. : 148.0 Min. : 7680
## 1st Qu.:13526812 1st Qu.:208129 1st Qu.: 970.0 1st Qu.:10066
## Median :14867410 Median :398084 Median : 991.0 Median :10585
## Mean :14757544 Mean :413793 Mean : 928.9 Mean :10534
## 3rd Qu.:16284762 3rd Qu.:618440 3rd Qu.:1024.0 3rd Qu.:11151
## Max. :17650340 Max. :931973 Max. :1048.0 Max. :11509
## TYP_EVENT EVENT_DATE
## Length:2060360 Min. :2019-01-01 00:00:46
## Class :character 1st Qu.:2019-02-13 13:01:06
## Mode :character Median :2019-02-28 14:01:27
## Mean :2019-03-04 14:23:02
## 3rd Qu.:2019-04-04 17:52:18
## Max. :2019-04-30 23:58:41
# Import the seventh dataset (purchase tickets) and show descriptive statistics.
raw7 <- read_csv2(file.path(directory, "raw_7_tic.csv"),
na = c("NA", ""))
knitr::kable(head(raw7))
| ID_SCONTRINO | ID_CLI | ID_NEG | ID_ARTICOLO | COD_REPARTO | DIREZIONE | IMPORTO_LORDO | SCONTO | DATETIME |
|---|---|---|---|---|---|---|---|---|
| 51299709701/05/18718488513770 | 318714 | 48 | 34216854 | 2 | 1 | 22.80 | 0.05 | 2018-05-01 07:18:48 |
| 51299709701/05/18718488513770 | 318714 | 48 | 34216854 | 2 | 1 | 22.75 | 0.00 | 2018-05-01 07:18:48 |
| 51399406901/05/18718488528430 | 324419 | 47 | 36167733 | 9 | 1 | 1290.00 | 491.00 | 2018-05-01 07:18:48 |
| 511499219301/05/18718488544998 | 332611 | 36 | 32404540 | 13 | 1 | 134.00 | 34.10 | 2018-05-01 07:18:48 |
| 511599623601/05/18718498530796 | 325589 | 35 | 35811412 | 9 | 1 | 399.00 | 0.00 | 2018-05-01 07:18:49 |
| 511599623601/05/18718498530796 | 325589 | 35 | 35811741 | 9 | 1 | 474.00 | 0.00 | 2018-05-01 07:18:49 |
summary(raw7)
## ID_SCONTRINO ID_CLI ID_NEG ID_ARTICOLO
## Length:4263220 Min. : 5 Min. : 2.00 Min. : 16921
## Class :character 1st Qu.:211201 1st Qu.:14.00 1st Qu.:34141800
## Mode :character Median :411821 Median :27.00 Median :35558446
## Mean :411433 Mean :25.93 Mean :38022591
## 3rd Qu.:601672 3rd Qu.:38.00 3rd Qu.:36144465
## Max. :934729 Max. :49.00 Max. :82231481
## COD_REPARTO DIREZIONE IMPORTO_LORDO SCONTO
## Min. : 1.000 Min. :-1.0000 Min. :-17465.00 Min. :-2472.000
## 1st Qu.: 3.000 1st Qu.: 1.0000 1st Qu.: 2.67 1st Qu.: 0.000
## Median : 7.000 Median : 1.0000 Median : 7.50 Median : 0.000
## Mean : 7.393 Mean : 0.8866 Mean : 32.65 Mean : 2.329
## 3rd Qu.:11.000 3rd Qu.: 1.0000 3rd Qu.: 22.00 3rd Qu.: 0.000
## Max. :14.000 Max. : 1.0000 Max. : 17465.00 Max. : 4853.590
## DATETIME
## Min. :2018-05-01 07:18:48
## 1st Qu.:2018-08-12 14:05:04
## Median :2018-11-09 16:37:45
## Mean :2018-11-06 03:52:05
## 3rd Qu.:2019-02-02 12:49:36
## Max. :2019-04-30 21:57:25
Si puliscono i dataset e si correggono gli eventuali errori o imprecisioni per favorire un'analisi più accurata ed aderente alla realtà
# Create working copies of the datasets so the raw originals are kept as backup.
data1 <- raw1
data2 <- raw2
data3 <- raw3
data4 <- raw4
data5 <- raw5
data6 <- raw6
data7 <- raw7
# Duplicate search on data1: compare distinct clients, distinct cards, and
# distinct (client, card) pairs against the row count.
data1_dup <- data1 %>%
summarise(TOT_ID_CLIs = n_distinct(ID_CLI),
TOT_ID_FIDs = n_distinct(ID_FID),
TOT_ID_CLIFIDs = n_distinct(paste0(as.character(ID_CLI), "-",
as.character(ID_FID))),
TOT_ROWs = n())
knitr::kable(data1_dup)
| TOT_ID_CLIs | TOT_ID_FIDs | TOT_ID_CLIFIDs | TOT_ROWs |
|---|---|---|---|
| 369472 | 367925 | 370135 | 370135 |
# Parse the activation date and convert the categorical attributes to factors.
data1 <- data1 %>%
mutate(DT_ACTIVE = as.Date(DT_ACTIVE)) %>%
mutate(TYP_CLI_FID = as.factor(TYP_CLI_FID)) %>%
mutate(STATUS_FID = as.factor(STATUS_FID))
# Count the number of loyalty subscriptions (and distinct activation dates) per client.
fid_sub_cli <- data1 %>%
group_by(ID_CLI) %>%
summarise(NUM_FIDs = n_distinct(ID_FID),
NUM_DATEs = n_distinct(DT_ACTIVE))
# Percentage distribution of the number of loyalty cards per client.
tot_id_cli <- n_distinct(fid_sub_cli$ID_CLI)
dist_fid_sub_cli <- fid_sub_cli %>%
group_by(NUM_FIDs, NUM_DATEs) %>%
summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT_CLIs = TOT_CLIs/tot_id_cli)
knitr::kable(dist_fid_sub_cli)
| NUM_FIDs | NUM_DATEs | TOT_CLIs | PERCENT_CLIs |
|---|---|---|---|
| 1 | 1 | 368833 | 0.9982705 |
| 2 | 1 | 254 | 0.0006875 |
| 2 | 2 | 363 | 0.0009825 |
| 3 | 1 | 7 | 0.0000189 |
| 3 | 2 | 8 | 0.0000217 |
| 3 | 3 | 5 | 0.0000135 |
| 4 | 1 | 2 | 0.0000054 |
# Clients holding 3 subscriptions, with example look-ups on individual clients.
fid_sub_cli %>%
filter(NUM_FIDs == 3)
## # A tibble: 20 x 3
## ID_CLI NUM_FIDs NUM_DATEs
## <dbl> <int> <int>
## 1 7533 3 3
## 2 11477 3 1
## 3 68556 3 1
## 4 96537 3 1
## 5 223203 3 3
## 6 250133 3 2
## 7 311669 3 1
## 8 320880 3 2
## 9 621814 3 3
## 10 648813 3 3
## 11 662651 3 3
## 12 671097 3 2
## 13 688941 3 2
## 14 722274 3 2
## 15 736520 3 2
## 16 757759 3 2
## 17 792369 3 1
## 18 826640 3 2
## 19 859327 3 1
## 20 879261 3 1
data1 %>%
filter(ID_CLI == 96537)
## # A tibble: 3 x 7
## ID_CLI ID_FID ID_NEG TYP_CLI_FID COD_FID STATUS_FID DT_ACTIVE
## <dbl> <dbl> <dbl> <fct> <chr> <fct> <date>
## 1 96537 226059 21 1 PREMIUM 0 2018-04-14
## 2 96537 226128 21 1 PREMIUM 0 2018-04-14
## 3 96537 226156 21 1 STANDARD 1 2018-04-14
data1 %>%
filter(ID_CLI == 250133)
## # A tibble: 3 x 7
## ID_CLI ID_FID ID_NEG TYP_CLI_FID COD_FID STATUS_FID DT_ACTIVE
## <dbl> <dbl> <dbl> <fct> <chr> <fct> <date>
## 1 250133 158212 11 1 STANDARD 0 2018-03-15
## 2 250133 309538 1 1 PREMIUM 0 2018-05-22
## 3 250133 310499 1 1 PREMIUM 0 2018-05-22
# Combine per-client information from the first subscription to the last.
# For each client keep the earliest-activated card (ties broken by lowest ID_FID)...
client_data1_first <- data1 %>%
group_by(ID_CLI) %>%
filter(DT_ACTIVE == min(DT_ACTIVE)) %>%
arrange(ID_FID) %>%
filter(row_number() == 1) %>%
ungroup() %>%
as.data.frame()
# ...and the latest-activated card (ties broken by highest ID_FID).
client_data1_last <- data1 %>%
group_by(ID_CLI) %>%
filter(DT_ACTIVE == max(DT_ACTIVE)) %>%
arrange(desc(ID_FID)) %>%
filter(row_number() == 1) %>%
ungroup() %>%
as.data.frame()
# Left-join `client_data1_first` and `fid_sub_cli` onto `client_data1_last`,
# renaming columns to distinguish FIRST_* and LAST_* attributes.
data1_complete <- client_data1_last %>%
select(ID_CLI,
ID_FID,
LAST_COD_FID = COD_FID,
LAST_TYP_CLI_FID = TYP_CLI_FID,
LAST_STATUS_FID = STATUS_FID,
LAST_DT_ACTIVE = DT_ACTIVE) %>%
left_join(client_data1_first %>%
select(ID_CLI,
FIRST_ID_NEG = ID_NEG,
FIRST_DT_ACTIVE = DT_ACTIVE),
by = "ID_CLI") %>%
left_join(fid_sub_cli %>%
select(ID_CLI,
NUM_FIDs) %>%
mutate(NUM_FIDs = as.factor(NUM_FIDs)),
by = "ID_CLI")
# Distribution of the loyalty programme, visualised with a chart.
data1_fid <- data1_complete %>%
group_by(LAST_COD_FID) %>%
summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT))
knitr::kable(data1_fid)
| LAST_COD_FID | TOT_CLIs | PERCENT |
|---|---|---|
| STANDARD | 289756 | 0.7842435 |
| PREMIUM | 43878 | 0.1187587 |
| STANDARD BIZ | 29148 | 0.0788910 |
| PREMIUM BIZ | 6690 | 0.0181069 |
# Bar chart of client counts per loyalty programme (geom_col is geom_bar with stat = "identity").
ggplot(data1_fid, aes(x = LAST_COD_FID, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  theme_minimal()
# Distribution of `LAST_STATUS_FID`, visualised with a chart.
data1_laststatus <- data1_complete %>%
group_by(LAST_STATUS_FID) %>%
summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT))
knitr::kable(data1_laststatus)
| LAST_STATUS_FID | TOT_CLIs | PERCENT |
|---|---|---|
| 1 | 366413 | 0.9917206 |
| 0 | 3059 | 0.0082794 |
ggplot(data = data1_laststatus,
aes(x = LAST_STATUS_FID, y = TOT_CLIs)) +
geom_bar(stat = "identity",
fill = "steelblue") +
theme_minimal()
# Distribution of `NUM_FIDs`, visualised with a chart.
data1_numfid <- data1_complete %>%
group_by(NUM_FIDs) %>%
summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT))
knitr::kable(data1_numfid)
| NUM_FIDs | TOT_CLIs | PERCENT |
|---|---|---|
| 1 | 368833 | 0.9982705 |
| 2 | 617 | 0.0016700 |
| 3 | 20 | 0.0000541 |
| 4 | 2 | 0.0000054 |
ggplot(data = data1_numfid,
aes(x = NUM_FIDs, y = TOT_CLIs)) +
geom_bar(stat = "identity",
fill = "steelblue") +
theme_minimal()
# Distribution of `LAST_TYP_CLI_FID`, visualised with a chart.
data1_type <- data1_complete %>%
group_by(LAST_TYP_CLI_FID) %>%
summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT))
knitr::kable(data1_type)
| LAST_TYP_CLI_FID | TOT_CLIs | PERCENT |
|---|---|---|
| 1 | 363865 | 0.9848243 |
| 0 | 5607 | 0.0151757 |
ggplot(data = data1_type,
aes(x = LAST_TYP_CLI_FID, y = TOT_CLIs)) +
geom_bar(stat = "identity",
fill = "steelblue") +
theme_minimal()
# Distribution of `FIRST_DT_ACTIVE` (first activation date), visualised with a chart.
data1_date <- data1_complete %>%
group_by(FIRST_DT_ACTIVE) %>%
summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT))
knitr::kable(head(data1_date))
| FIRST_DT_ACTIVE | TOT_CLIs | PERCENT |
|---|---|---|
| 2018-11-23 | 3021 | 0.0081765 |
| 2018-04-07 | 1457 | 0.0039435 |
| 2018-11-22 | 1438 | 0.0038920 |
| 2018-03-11 | 1437 | 0.0038893 |
| 2018-04-28 | 1400 | 0.0037892 |
| 2018-04-14 | 1399 | 0.0037865 |
ggplot(data = data1_date,
aes(x = FIRST_DT_ACTIVE, y = TOT_CLIs)) +
geom_line() +
theme_minimal()
# Duplicate check on data2: no duplicates found (distinct clients == rows).
data2_dup <- data2 %>%
summarise(TOT_ID_CLIs = n_distinct(ID_CLI),
TOT_ROWs = n())
knitr::kable(data2_dup)
| TOT_ID_CLIs | TOT_ROWs |
|---|---|
| 369472 | 369472 |
# Factorise the relevant variables and make missing values explicit:
# missing W_PHONE becomes "0" (no phone), missing provider/job become "(missing)".
# NOTE(review): `fct_explicit_na()` is deprecated in recent forcats in favour of
# `fct_na_value_to_level()` — consider migrating when the package is upgraded.
data2_complete <- data2 %>%
mutate(W_PHONE = as.factor(W_PHONE)) %>%
mutate(TYP_CLI_ACCOUNT = as.factor(TYP_CLI_ACCOUNT)) %>%
mutate(W_PHONE = fct_explicit_na(W_PHONE, "0")) %>%
mutate(EMAIL_PROVIDER = fct_explicit_na(EMAIL_PROVIDER,
"(missing)")) %>%
mutate(TYP_JOB = fct_explicit_na(TYP_JOB, "(missing)"))
# Distribution of `EMAIL_PROVIDER`.
data2_emailprovider <- data2_complete %>%
group_by(EMAIL_PROVIDER) %>%
summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT)) %>%
as.data.frame()
knitr::kable(head(data2_emailprovider))
| EMAIL_PROVIDER | TOT_CLIs | PERCENT |
|---|---|---|
| gmail.com | 151508 | 0.4100663 |
| libero.it | 57782 | 0.1563907 |
| hotmail.it | 28698 | 0.0776730 |
| alice.it | 18127 | 0.0490619 |
| yahoo.it | 16538 | 0.0447612 |
| hotmail.com | 10076 | 0.0272713 |
# Regroup `EMAIL_PROVIDER`: many levels have near-zero frequency, so every level
# beyond a cumulative coverage of 0.85 is collapsed into a single `others` level.
knitr::kable(data2_emailprovider %>%
arrange(desc(PERCENT)) %>%
mutate(PERCENT_COVERED = cumsum(TOT_CLIs)/sum(TOT_CLIs)) %>%
as.data.frame() %>%
head(10))
| EMAIL_PROVIDER | TOT_CLIs | PERCENT | PERCENT_COVERED |
|---|---|---|---|
| gmail.com | 151508 | 0.4100663 | 0.4100663 |
| libero.it | 57782 | 0.1563907 | 0.5664570 |
| hotmail.it | 28698 | 0.0776730 | 0.6441300 |
| alice.it | 18127 | 0.0490619 | 0.6931919 |
| yahoo.it | 16538 | 0.0447612 | 0.7379531 |
| hotmail.com | 10076 | 0.0272713 | 0.7652244 |
| virgilio.it | 9161 | 0.0247948 | 0.7900193 |
| tiscali.it | 8733 | 0.0236364 | 0.8136557 |
| live.it | 7936 | 0.0214793 | 0.8351350 |
| (missing) | 5889 | 0.0159390 | 0.8510740 |
# AUX = 1 for providers kept as-is: those below the 0.85 coverage threshold plus
# the first provider that crosses it (detected via lag()). Everything else is
# mapped to "others", except the explicit "(missing)" level which is preserved.
# Note: `if_else(AUX | ...)` relies on the numeric AUX (0/1) coercing to logical.
emailprovider <- data2_emailprovider %>%
arrange(desc(PERCENT)) %>%
mutate(PERCENT_COVERED = cumsum(TOT_CLIs)/sum(TOT_CLIs)) %>%
mutate(EMAIL_PROVIDER = as.character(EMAIL_PROVIDER)) %>%
mutate(AUX = if_else(PERCENT_COVERED < 0.85 |
(PERCENT_COVERED > 0.85 &
lag(PERCENT_COVERED) < 0.85), 1,0)) %>%
mutate(EMAIL_PROVIDER_CLEAN = if_else(AUX | EMAIL_PROVIDER == "(missing)",
EMAIL_PROVIDER,
"others"))
knitr::kable(head(emailprovider, 7))
| EMAIL_PROVIDER | TOT_CLIs | PERCENT | PERCENT_COVERED | AUX | EMAIL_PROVIDER_CLEAN |
|---|---|---|---|---|---|
| gmail.com | 151508 | 0.4100663 | 0.4100663 | 1 | gmail.com |
| libero.it | 57782 | 0.1563907 | 0.5664570 | 1 | libero.it |
| hotmail.it | 28698 | 0.0776730 | 0.6441300 | 1 | hotmail.it |
| alice.it | 18127 | 0.0490619 | 0.6931919 | 1 | alice.it |
| yahoo.it | 16538 | 0.0447612 | 0.7379531 | 1 | yahoo.it |
| hotmail.com | 10076 | 0.0272713 | 0.7652244 | 1 | hotmail.com |
| virgilio.it | 9161 | 0.0247948 | 0.7900193 | 1 | virgilio.it |
# Map the cleaned provider back onto data2_complete, replacing the raw column.
data2_complete <- data2_complete %>%
mutate(EMAIL_PROVIDER = as.character(EMAIL_PROVIDER)) %>%
left_join(emailprovider %>%
select(EMAIL_PROVIDER, EMAIL_PROVIDER_CLEAN),
by = "EMAIL_PROVIDER") %>%
select(-EMAIL_PROVIDER) %>%
mutate(EMAIL_PROVIDER_CLEAN = as.factor(EMAIL_PROVIDER_CLEAN))
# Distribution of `W_PHONE` (phone number on file), visualised with a chart below.
data2_phone <- data2_complete %>%
group_by(W_PHONE) %>%
summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT)) %>%
as.data.frame()
knitr::kable(data2_phone)
| W_PHONE | TOT_CLIs | PERCENT |
|---|---|---|
| 1 | 342167 | 0.9260972 |
| 0 | 27305 | 0.0739028 |
# Bar chart of client counts by phone availability (geom_col = geom_bar with stat = "identity").
ggplot(data2_phone, aes(x = W_PHONE, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  theme_minimal()
# Distribution of `TYP_CLI_ACCOUNT`.
data2_account <- data2_complete %>%
group_by(TYP_CLI_ACCOUNT) %>%
summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT)) %>%
as.data.frame()
knitr::kable(data2_account)
| TYP_CLI_ACCOUNT | TOT_CLIs | PERCENT |
|---|---|---|
| 4 | 333656 | 0.9030617 |
| 2 | 35816 | 0.0969383 |
# Distribution of `EMAIL_PROVIDER_CLEAN`, visualised with a chart.
data2_emailproviderclean <- data2_complete %>%
group_by(EMAIL_PROVIDER_CLEAN) %>%
summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT))
ggplot(data = data2_emailproviderclean,
aes(x = EMAIL_PROVIDER_CLEAN,
y = TOT_CLIs)) +
geom_bar(stat = "identity",
fill = "steelblue") +
theme_minimal()
# Distribution of `TYP_JOB`, visualised with a chart (dominated by "(missing)").
data2_job<- data2_complete %>%
group_by(TYP_JOB) %>%
summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT)) %>%
as.data.frame()
knitr::kable(data2_job)
| TYP_JOB | TOT_CLIs | PERCENT |
|---|---|---|
| (missing) | 360810 | 0.9765557 |
| Libero professionista | 3970 | 0.0107451 |
| Impiegato/a | 1560 | 0.0042222 |
| Altro | 784 | 0.0021219 |
| Pensionato/a | 641 | 0.0017349 |
| Operaio/a | 482 | 0.0013046 |
| Dirigente/Quadro/Funzionario | 354 | 0.0009581 |
| Non Dichiara | 218 | 0.0005900 |
| Casalinga | 191 | 0.0005170 |
| Artigiano | 131 | 0.0003546 |
| Imprenditore | 108 | 0.0002923 |
| In cerca di occupazione | 79 | 0.0002138 |
| Commerciante/Esercente | 71 | 0.0001922 |
| Studente | 54 | 0.0001462 |
| Rappresentante/Agente Commerciale | 19 | 0.0000514 |
ggplot(data = data2_job,
aes(x = TYP_JOB,
y = TOT_CLIs)) +
geom_bar(stat = "identity",
fill = "steelblue") +
theme_minimal()
# Distribution of `TYP_CLI_ACCOUNT`.
# NOTE(review): this repeats the identical analysis performed earlier in the file;
# one of the two occurrences could be removed.
data2_account <- data2_complete %>%
group_by(TYP_CLI_ACCOUNT) %>%
summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>%
arrange(desc(PERCENT)) %>%
as.data.frame()
knitr::kable(data2_account)
| TYP_CLI_ACCOUNT | TOT_CLIs | PERCENT |
|---|---|---|
| 4 | 333656 | 0.9030617 |
| 2 | 35816 | 0.0969383 |
# Duplicate check on data3 and removal of the duplicates found.
knitr::kable(data3 %>%
summarise(TOT_ID_ADDRESSes = n_distinct(ID_ADDRESS),
TOT_ROWs = n()))
| TOT_ID_ADDRESSes | TOT_ROWs |
|---|---|
| 361330 | 1211332 |
data3_complete <- data3 %>%
distinct()
# Cast `CAP` to character and drop rows missing any of `CAP`, `PRV`, `REGION`.
data3_complete <- data3_complete %>%
mutate(CAP = as.character(CAP)) %>%
filter(!is.na(CAP) & !is.na(PRV) & !is.na(REGION))
# `CAP` (postal code) has 4784 distinct values — too many to analyse directly.
nrow(data3_complete %>%
distinct(CAP))
## [1] 4784
# `PRV` (province) still has 110 distinct values.
nrow(data3_complete %>%
distinct(PRV))
## [1] 110
# `REGION` has only 20 distinct values, so the analysis focuses on it;
# its distribution is visualised with a chart.
nrow(data3_complete %>%
distinct(REGION))
## [1] 20
data3_region<- data3_complete %>%
group_by(REGION) %>%
summarise(TOT_ADDRESS = n_distinct(ID_ADDRESS)) %>%
mutate(PERCENT = TOT_ADDRESS/sum(TOT_ADDRESS)) %>%
arrange(desc(PERCENT))
knitr::kable(head(data3_region))
| REGION | TOT_ADDRESS | PERCENT |
|---|---|---|
| LOMBARDIA | 97181 | 0.2879710 |
| LAZIO | 32058 | 0.0949957 |
| CAMPANIA | 30570 | 0.0905864 |
| VENETO | 29696 | 0.0879965 |
| SICILIA | 28329 | 0.0839457 |
| PIEMONTE | 24377 | 0.0722350 |
ggplot(data = data3_region, aes(x = REGION,
y = TOT_ADDRESS)) +
geom_bar(stat = "identity",
fill = "steelblue") +
theme_minimal()
# Duplicate check on data4: no duplicates found.
knitr::kable(data4 %>%
summarise(TOT_ID_CLIs = n_distinct(ID_CLI),
TOT_ROWs = n()))
| TOT_ID_CLIs | TOT_ROWs |
|---|---|
| 369472 | 369472 |
# Factorise the binary privacy flags.
data4_complete <- data4 %>%
mutate(FLAG_PRIVACY_1 = as.factor(FLAG_PRIVACY_1)) %>%
mutate(FLAG_PRIVACY_2 = as.factor(FLAG_PRIVACY_2)) %>%
mutate(FLAG_DIRECT_MKT = as.factor(FLAG_DIRECT_MKT))
# Distribution of `FLAG_PRIVACY_1`, visualised with a chart.
data4_privacy1<- data4_complete %>%
group_by(FLAG_PRIVACY_1) %>%
summarise(TOT_ID = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_ID/sum(TOT_ID)) %>%
arrange(desc(PERCENT))
knitr::kable(data4_privacy1)
| FLAG_PRIVACY_1 | TOT_ID | PERCENT |
|---|---|---|
| 1 | 242251 | 0.6556681 |
| 0 | 127221 | 0.3443319 |
ggplot(data = data4_privacy1,
aes(x = FLAG_PRIVACY_1,
y = TOT_ID)) +
geom_bar(stat = "identity",
fill = "steelblue") +
theme_minimal()
# Distribution of `FLAG_PRIVACY_2`, visualised with a chart.
data4_privacy2 <- data4_complete %>%
group_by(FLAG_PRIVACY_2) %>%
summarise(TOT_ID = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_ID/sum(TOT_ID)) %>%
arrange(desc(PERCENT))
knitr::kable(data4_privacy2)
| FLAG_PRIVACY_2 | TOT_ID | PERCENT |
|---|---|---|
| 1 | 345682 | 0.9356108 |
| 0 | 23790 | 0.0643892 |
ggplot(data = data4_privacy2,
aes(x = FLAG_PRIVACY_2,
y = TOT_ID)) +
geom_bar(stat = "identity",
fill = "steelblue") +
theme_minimal()
# Distribution of `FLAG_DIRECT_MKT`, visualised with a chart.
data4_privacy_mkt <- data4_complete %>%
group_by(FLAG_DIRECT_MKT) %>%
summarise(TOT_ID = n_distinct(ID_CLI)) %>%
mutate(PERCENT = TOT_ID/sum(TOT_ID)) %>%
arrange(desc(PERCENT))
knitr::kable(data4_privacy_mkt)
| FLAG_DIRECT_MKT | TOT_ID | PERCENT |
|---|---|---|
| 1 | 247790 | 0.6706598 |
| 0 | 121682 | 0.3293402 |
ggplot(data = data4_privacy_mkt,
aes(x = FLAG_DIRECT_MKT,
y = TOT_ID)) +
geom_bar(stat = "identity",
fill = "steelblue") +
theme_minimal()
# Drop `CHANNEL_CAMP`, judged non-discriminative for the analysis.
data5_complete <- select(data5, -CHANNEL_CAMP)
# Parse event timestamps.
# NOTE(review): the parse format "%Y-%m-%dT%H:%M:%S" assumes an ISO 'T' separator,
# while the preview table shows a space separator — confirm against the raw CSV
# that as.POSIXct does not silently produce NAs here.
data6_complete <- data6 %>%
mutate(EVENT_DATETIME = as.POSIXct(EVENT_DATE,
format = "%Y-%m-%dT%H:%M:%S")) %>%
mutate(EVENT_HOUR = hour(EVENT_DATETIME)) %>%
mutate(EVENT_DATE = as.Date(EVENT_DATETIME))
# Factorise `TYP_EVENT`, merging levels `E` and `B` into `F` (failures),
# then attach the campaign type via data5_complete.
data6_complete<- data6_complete %>%
mutate(TYP_EVENT = as.factor(if_else(TYP_EVENT == "E" | TYP_EVENT == "B",
"F", as.character(TYP_EVENT)))) %>%
left_join(data5_complete,
by = "ID_CAMP")
# For each send event ("S"), the matching opens, clicks and failures are attached later.
d6_sends <- data6_complete %>%
filter(TYP_EVENT == "S") %>%
select(-TYP_EVENT) %>%
select(ID_EVENT_S = ID_EVENT,
ID_CLI,
ID_CAMP,
TYP_CAMP,
ID_DELIVERY,
SEND_DATE = EVENT_DATE) %>%
as.data.frame()
# Count open events ("V"); to avoid counting the same open twice, only the first
# open per (client, campaign, delivery) is kept.
d6_opens_parz <- data6_complete %>%
filter(TYP_EVENT == "V") %>%
select(-TYP_EVENT) %>%
select(ID_EVENT_O = ID_EVENT,
ID_CLI,
ID_CAMP,
TYP_CAMP,
ID_DELIVERY,
OPEN_DATETIME = EVENT_DATETIME,
OPEN_DATE = EVENT_DATE) %>%
as.data.frame()
# Total number of distinct opens per (client, campaign, delivery).
d6_total_opens <- d6_opens_parz %>%
group_by(ID_CLI,
ID_CAMP,
ID_DELIVERY) %>%
summarise(NUM_OPENs = n_distinct(ID_EVENT_O))
# Keep only the earliest open (ties broken by row order) and attach the total count.
d6_opens <- d6_opens_parz %>%
left_join(d6_total_opens,
by = c("ID_CLI",
"ID_CAMP",
"ID_DELIVERY")) %>%
group_by(ID_CLI,
ID_CAMP,
ID_DELIVERY) %>%
filter(OPEN_DATETIME == min(OPEN_DATETIME)) %>%
filter(row_number() == 1) %>%
ungroup() %>%
as.data.frame()
# Count click events ("C"); as with opens, only the first click per
# (client, campaign, delivery) is kept.
d6_clicks_parz <- data6_complete %>%
filter(TYP_EVENT == "C") %>%
select(-TYP_EVENT) %>%
select(ID_EVENT_C = ID_EVENT,
ID_CLI,
ID_CAMP,
TYP_CAMP,
ID_DELIVERY,
CLICK_DATETIME = EVENT_DATETIME,
CLICK_DATE = EVENT_DATE)
d6_total_clicks <- d6_clicks_parz %>%
group_by(ID_CLI,
ID_CAMP,
ID_DELIVERY) %>%
summarise(NUM_CLICKs = n_distinct(ID_EVENT_C))
d6_clicks <- d6_clicks_parz %>%
left_join(d6_total_clicks,
by = c("ID_CLI",
"ID_CAMP",
"ID_DELIVERY")) %>%
group_by(ID_CLI,
ID_CAMP,
ID_DELIVERY) %>%
filter(CLICK_DATETIME == min(CLICK_DATETIME)) %>%
filter(row_number() == 1) %>%
ungroup() %>%
as.data.frame()
# Count failure events ("F"), keeping the first failure per (client, campaign, delivery).
d6_fails <- data6_complete %>%
filter(TYP_EVENT == "F") %>%
select(-TYP_EVENT) %>%
select(ID_EVENT_F = ID_EVENT,
ID_CLI,
ID_CAMP,
TYP_CAMP,
ID_DELIVERY,
FAIL_DATETIME = EVENT_DATETIME,
FAIL_DATE = EVENT_DATE) %>%
group_by(ID_CLI,
ID_CAMP,
ID_DELIVERY) %>%
filter(FAIL_DATETIME == min(FAIL_DATETIME)) %>%
filter(row_number() == 1) %>%
ungroup() %>%
as.data.frame()
# Combine all the event information gathered above into a single table keyed on
# send events, discarding opens/clicks/fails that predate their send (data errors).
data6_complete_finale <- d6_sends %>%
left_join(d6_opens,
by = c("ID_CLI",
"ID_CAMP",
"ID_DELIVERY",
"TYP_CAMP")) %>%
filter(is.na(OPEN_DATE) | SEND_DATE <= OPEN_DATE) %>%
left_join(d6_clicks,
by = c("ID_CLI",
"ID_CAMP",
"ID_DELIVERY",
"TYP_CAMP")) %>%
filter(is.na(CLICK_DATE) | OPEN_DATE <= CLICK_DATE) %>%
left_join(d6_fails,
by = c("ID_CLI",
"ID_CAMP",
"ID_DELIVERY",
"TYP_CAMP")) %>%
filter(is.na(FAIL_DATE) | SEND_DATE <= FAIL_DATE) %>%
mutate(OPENED = !is.na(ID_EVENT_O)) %>%
mutate(CLICKED = !is.na(ID_EVENT_C)) %>%
mutate(FAILED = !is.na(ID_EVENT_F)) %>%
mutate(DAYS_TO_OPEN = as.integer(OPEN_DATE - SEND_DATE)) %>%
select(ID_EVENT_S,
ID_CLI,
ID_CAMP,
TYP_CAMP,
ID_DELIVERY,
SEND_DATE,
OPENED,
OPEN_DATE,
DAYS_TO_OPEN,
NUM_OPENs,
CLICKED,
CLICK_DATE,
NUM_CLICKs,
FAILED)
# Overview of the combined dataset: date range, events and clients.
d6_overview <- data6_complete_finale %>%
summarise(MIN_DATE = min(SEND_DATE),
MAX_DATE = max(SEND_DATE),
TOT_EVENTs = n_distinct(ID_EVENT_S),
TOT_CLIs = n_distinct(ID_CLI))
knitr::kable(d6_overview)
| MIN_DATE | MAX_DATE | TOT_EVENTs | TOT_CLIs |
|---|---|---|---|
| 2019-01-03 | 2019-04-30 | 1556646 | 190427 |
# Distribution of `TYP_CAMP`, visualised with a chart.
data6_typ_camp <- data6_complete_finale %>%
group_by(TYP_CAMP) %>%
summarise(MIN_DATE = min(SEND_DATE),
MAX_DATE = max(SEND_DATE),
TOT_EVENTs = n_distinct(ID_EVENT_S),
TOT_CLIs = n_distinct(ID_CLI))
knitr::kable(data6_typ_camp)
| TYP_CAMP | MIN_DATE | MAX_DATE | TOT_EVENTs | TOT_CLIs |
|---|---|---|---|---|
| LOCAL | 2019-02-02 | 2019-04-02 | 151719 | 87894 |
| NATIONAL | 2019-01-07 | 2019-04-23 | 833085 | 177153 |
| PERSONALIZED | 2019-01-03 | 2019-04-30 | 194840 | 133908 |
| PRODUCT | 2019-01-03 | 2019-04-25 | 377002 | 69724 |
ggplot(data = data6_typ_camp,
aes(x = TYP_CAMP,
y = TOT_EVENTs)) +
geom_bar(stat = "identity",
fill = "steelblue") +
theme_minimal()
# Distribution of `OPENED`, visualised with a chart.
data6_opened <- data6_complete_finale %>%
group_by(OPENED) %>%
summarise(TOT_EVENTs = n_distinct(ID_EVENT_S),
TOT_CLIs = n_distinct(ID_CLI)) %>%
mutate(TYP_CAMP = 'ALL') %>%
mutate(PERCENT_EVENTs = TOT_EVENTs/d6_overview$TOT_EVENTs,
PERCENT_CLIs = TOT_CLIs/d6_overview$TOT_CLIs)
knitr::kable(data6_opened)
| OPENED | TOT_EVENTs | TOT_CLIs | TYP_CAMP | PERCENT_EVENTs | PERCENT_CLIs |
|---|---|---|---|---|---|
| FALSE | 1278264 | 178378 | ALL | 0.8211655 | 0.9367264 |
| TRUE | 278382 | 83420 | ALL | 0.1788345 | 0.4380681 |
ggplot(data = data6_opened,
aes(fill = OPENED,
x = TYP_CAMP,
y = TOT_EVENTs)) +
geom_bar(stat = "identity",
position = "fill") +
theme_minimal()
# Distribution of opened e-mails per campaign type, shown both in absolute
# counts and as percentages.
data6_openedbytyp <- data6_complete_finale %>%
group_by(TYP_CAMP, OPENED) %>%
summarise(TOT_EVENTs = n_distinct(ID_EVENT_S),
TOT_CLIs = n_distinct(ID_CLI)) %>%
left_join(data6_typ_camp %>%
select(TYP_CAMP,
ALL_TOT_EVENTs = TOT_EVENTs,
ALL_TOT_CLIs = TOT_CLIs),
by = 'TYP_CAMP') %>%
mutate(PERCENT_EVENTs = TOT_EVENTs/ALL_TOT_EVENTs,
PERCENT_CLIs = TOT_CLIs/ALL_TOT_CLIs) %>%
select(TYP_CAMP,
OPENED,
TOT_EVENTs,
TOT_CLIs,
PERCENT_EVENTs,
PERCENT_CLIs)
knitr::kable(data6_openedbytyp)
| TYP_CAMP | OPENED | TOT_EVENTs | TOT_CLIs | PERCENT_EVENTs | PERCENT_CLIs |
|---|---|---|---|---|---|
| LOCAL | FALSE | 126700 | 76835 | 0.8350965 | 0.8741780 |
| LOCAL | TRUE | 25019 | 18029 | 0.1649035 | 0.2051221 |
| NATIONAL | FALSE | 710721 | 162049 | 0.8531194 | 0.9147404 |
| NATIONAL | TRUE | 122364 | 62964 | 0.1468806 | 0.3554216 |
| PERSONALIZED | FALSE | 156431 | 111942 | 0.8028690 | 0.8359620 |
| PERSONALIZED | TRUE | 38409 | 31327 | 0.1971310 | 0.2339442 |
| PRODUCT | FALSE | 284412 | 62356 | 0.7544045 | 0.8943262 |
| PRODUCT | TRUE | 92590 | 29605 | 0.2455955 | 0.4246027 |
ggplot(data = data6_openedbytyp,
aes(fill = OPENED,
x = TYP_CAMP,
y = TOT_EVENTs)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(title="APERTURA MAIL PER TIPOLOGIA DI CAMPAGNA")
ggplot(data = data6_openedbytyp,
aes(fill = OPENED,
x = TYP_CAMP,
y = TOT_EVENTs)) +
geom_bar(position = "fill",
stat = "identity") +
theme_minimal()
# Distribution of `DAYS_TO_OPEN`: average (floored) days-to-open per client,
# then number of clients per average value; visualised with a chart.
data6_daystoopen <- data6_complete_finale %>%
filter(OPENED) %>%
group_by(ID_CLI) %>%
summarise(AVG_DAYS_TO_OPEN = floor(mean(DAYS_TO_OPEN))) %>%
ungroup() %>%
group_by(AVG_DAYS_TO_OPEN) %>%
summarise(TOT_CLIs = n_distinct(ID_CLI))
knitr::kable(head(data6_daystoopen))
| AVG_DAYS_TO_OPEN | TOT_CLIs |
|---|---|
| 0 | 53094 |
| 1 | 12383 |
| 2 | 4906 |
| 3 | 3066 |
| 4 | 2147 |
| 5 | 1459 |
ggplot(data = data6_daystoopen %>%
filter(AVG_DAYS_TO_OPEN < 14),
aes(x = AVG_DAYS_TO_OPEN,
y = TOT_CLIs)) +
geom_bar(stat = "identity",
fill = "steelblue") +
theme_minimal()
# Cumulative share of clients covered as a function of days-to-open.
data6_daystoopen_vs_cumulate <- data6_daystoopen %>%
arrange(AVG_DAYS_TO_OPEN) %>%
mutate(PERCENT_COVERED = cumsum(TOT_CLIs)/sum(TOT_CLIs))
ggplot(data = data6_daystoopen_vs_cumulate %>%
filter(AVG_DAYS_TO_OPEN < 14),
aes(x = AVG_DAYS_TO_OPEN,
y = PERCENT_COVERED)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = seq(0, 14, 2),
minor_breaks = 0:14) +
theme_minimal() +
labs(title="DISTRIBUZIONE GIORNI PRIMA DELL'APERTURA DELLA MAIL")
# CLICKED distribution by campaign type: event/client counts plus their share
# of the per-type totals taken from data6_typ_camp.
typ_totals <- data6_typ_camp %>%
  select(TYP_CAMP, ALL_TOT_EVENTs = TOT_EVENTs, ALL_TOT_CLIs = TOT_CLIs)
data6_clickedbytyp <- data6_complete_finale %>%
  group_by(TYP_CAMP, CLICKED) %>%
  summarise(TOT_EVENTs = n_distinct(ID_EVENT_S),
            TOT_CLIs = n_distinct(ID_CLI)) %>%
  left_join(typ_totals, by = "TYP_CAMP") %>%
  mutate(PERCENT_EVENTs = TOT_EVENTs / ALL_TOT_EVENTs,
         PERCENT_CLIs = TOT_CLIs / ALL_TOT_CLIs) %>%
  select(TYP_CAMP, CLICKED, TOT_EVENTs, TOT_CLIs,
         PERCENT_EVENTs, PERCENT_CLIs)
knitr::kable(data6_clickedbytyp)
| TYP_CAMP | CLICKED | TOT_EVENTs | TOT_CLIs | PERCENT_EVENTs | PERCENT_CLIs |
|---|---|---|---|---|---|
| LOCAL | FALSE | 150374 | 87332 | 0.9911349 | 0.9936059 |
| LOCAL | TRUE | 1345 | 1280 | 0.0088651 | 0.0145630 |
| NATIONAL | FALSE | 815796 | 175939 | 0.9792470 | 0.9931472 |
| NATIONAL | TRUE | 17289 | 14216 | 0.0207530 | 0.0802470 |
| PERSONALIZED | FALSE | 192741 | 133043 | 0.9892271 | 0.9935403 |
| PERSONALIZED | TRUE | 2099 | 2016 | 0.0107729 | 0.0150551 |
| PRODUCT | FALSE | 362551 | 69373 | 0.9616686 | 0.9949659 |
| PRODUCT | TRUE | 14451 | 8800 | 0.0383314 | 0.1262119 |
# Clicks by campaign type: absolute counts, then normalised to 100% per type.
ggplot(data6_clickedbytyp,
       aes(x = TYP_CAMP, y = TOT_EVENTs, fill = CLICKED)) +
  geom_col() +
  theme_minimal()
ggplot(data6_clickedbytyp,
       aes(x = TYP_CAMP, y = TOT_EVENTs, fill = CLICKED)) +
  geom_col(position = "fill") +
  theme_minimal()
# FAILED distribution by campaign type, with shares of the per-type totals.
typ_totals <- data6_typ_camp %>%
  select(TYP_CAMP, ALL_TOT_EVENTs = TOT_EVENTs, ALL_TOT_CLIs = TOT_CLIs)
data6_failedbytyp <- data6_complete_finale %>%
  group_by(TYP_CAMP, FAILED) %>%
  summarise(TOT_EVENTs = n_distinct(ID_EVENT_S),
            TOT_CLIs = n_distinct(ID_CLI)) %>%
  left_join(typ_totals, by = "TYP_CAMP") %>%
  mutate(PERCENT_EVENTs = TOT_EVENTs / ALL_TOT_EVENTs,
         PERCENT_CLIs = TOT_CLIs / ALL_TOT_CLIs) %>%
  select(TYP_CAMP, FAILED, TOT_EVENTs, TOT_CLIs,
         PERCENT_EVENTs, PERCENT_CLIs)
knitr::kable(data6_failedbytyp)
| TYP_CAMP | FAILED | TOT_EVENTs | TOT_CLIs | PERCENT_EVENTs | PERCENT_CLIs |
|---|---|---|---|---|---|
| LOCAL | FALSE | 149189 | 86540 | 0.9833244 | 0.9845951 |
| LOCAL | TRUE | 2530 | 1653 | 0.0166756 | 0.0188067 |
| NATIONAL | FALSE | 814897 | 174425 | 0.9781679 | 0.9846009 |
| NATIONAL | TRUE | 18188 | 7045 | 0.0218321 | 0.0397679 |
| PERSONALIZED | FALSE | 192844 | 132774 | 0.9897557 | 0.9915315 |
| PERSONALIZED | TRUE | 1996 | 1690 | 0.0102443 | 0.0126206 |
| PRODUCT | FALSE | 371788 | 68960 | 0.9861698 | 0.9890425 |
| PRODUCT | TRUE | 5214 | 1729 | 0.0138302 | 0.0247978 |
# Failures by campaign type: absolute counts, then normalised to 100% per type.
ggplot(data6_failedbytyp,
       aes(x = TYP_CAMP, y = TOT_EVENTs, fill = FAILED)) +
  geom_col() +
  theme_minimal()
ggplot(data6_failedbytyp,
       aes(x = TYP_CAMP, y = TOT_EVENTs, fill = FAILED)) +
  geom_col(position = "fill") +
  theme_minimal()
# Distribution of NUM_OPENs: how many distinct events fall in each open count,
# with the corresponding share, most frequent first.
opens_counts <- data6_complete_finale %>%
  group_by(NUM_OPENs) %>%
  summarise(TOT_ID = n_distinct(ID_EVENT_S))
data6_numopens <- opens_counts %>%
  mutate(PERCENT = TOT_ID / sum(TOT_ID)) %>%
  arrange(desc(PERCENT))
knitr::kable(head(data6_numopens))
| NUM_OPENs | TOT_ID | PERCENT |
|---|---|---|
| NA | 1278264 | 0.8211655 |
| 1 | 220900 | 0.1419077 |
| 2 | 37877 | 0.0243324 |
| 3 | 11058 | 0.0071037 |
| 4 | 4002 | 0.0025709 |
| 5 | 1833 | 0.0011775 |
# Bar chart of NUM_OPENs, window limited to [0, 15] (NA bar is dropped by xlim).
ggplot(data6_numopens, aes(x = NUM_OPENs, y = TOT_ID)) +
  geom_col(fill = "steelblue") +
  xlim(0, 15) +
  theme_minimal()
# Distribution of NUM_CLICKs, mirroring the NUM_OPENs analysis above.
clicks_counts <- data6_complete_finale %>%
  group_by(NUM_CLICKs) %>%
  summarise(TOT_ID = n_distinct(ID_EVENT_S))
data6_numclicks <- clicks_counts %>%
  mutate(PERCENT = TOT_ID / sum(TOT_ID)) %>%
  arrange(desc(PERCENT))
knitr::kable(head(data6_numclicks))
| NUM_CLICKs | TOT_ID | PERCENT |
|---|---|---|
| NA | 1521462 | 0.9773976 |
| 1 | 26912 | 0.0172885 |
| 2 | 5586 | 0.0035885 |
| 3 | 1604 | 0.0010304 |
| 4 | 593 | 0.0003809 |
| 5 | 263 | 0.0001690 |
# Bar chart of NUM_CLICKs, window limited to [0, 15].
ggplot(data6_numclicks, aes(x = NUM_CLICKs, y = TOT_ID)) +
  geom_col(fill = "steelblue") +
  xlim(0, 15) +
  theme_minimal()
# Parse the raw DATETIME stamp, derive hour and calendar date, and factorise
# the categorical codes.
# NOTE(review): the format assumes stamps like "2019-01-02T071035" — confirm
# against the raw file.
data7_complete <- data7 %>%
  mutate(TIC_DATETIME = as.POSIXct(DATETIME, format = "%Y-%m-%dT%H%M%S"),
         TIC_HOUR = hour(TIC_DATETIME),
         TIC_DATE = as.Date(TIC_DATETIME),
         DIREZIONE = as.factor(DIREZIONE),
         COD_REPARTO = as.factor(COD_REPARTO)) %>%
  select(-DATETIME)
# Day-type features for each ticket date.
# BUG FIX: lubridate::wday() defaults to week_start = 7 (Sunday = 1), so the
# original test `TIC_DATE_WEEKDAY %in% c(6, 7)` labelled Friday/Saturday as
# "weekend" and let Sunday fall through to "holiday"/"weekday". With
# week_start = 1 (Monday = 1), codes 6 and 7 are Saturday and Sunday.
data7_complete <- data7_complete %>%
  mutate(TIC_DATE_WEEKDAY = wday(TIC_DATE, week_start = 1)) %>%
  mutate(TIC_DATE_HOLIDAY = isHoliday("Italy", TIC_DATE)) %>%
  mutate(TIC_DATE_TYP = case_when(
    TIC_DATE_WEEKDAY %in% c(6, 7) ~ "weekend",
    TIC_DATE_HOLIDAY              ~ "holiday",
    TIC_DATE_WEEKDAY < 7          ~ "weekday",
    TRUE                          ~ "other"))
# Overview of the ticket table: date span, distinct receipts, distinct clients.
d7_panoramica <- data7_complete %>%
  summarise(
    MIN_DATE = min(TIC_DATE),
    MAX_DATE = max(TIC_DATE),
    TOT_TICs = n_distinct(ID_SCONTRINO),
    TOT_CLIs = n_distinct(ID_CLI)
  )
knitr::kable(d7_panoramica)
| MIN_DATE | MAX_DATE | TOT_TICs | TOT_CLIs |
|---|---|---|---|
| 2018-05-01 | 2019-04-30 | 998035 | 212124 |
# DIREZIONE distribution (1 = purchase, -1 = return); shares use the overall
# totals from d7_panoramica. Purchases are about 91% of receipts.
data7_direction <- data7_complete %>%
  group_by(DIREZIONE) %>%
  summarise(TOT_TICs = n_distinct(ID_SCONTRINO),
            TOT_CLIs = n_distinct(ID_CLI)) %>%
  mutate(PERCENT_TICs = TOT_TICs / d7_panoramica$TOT_TICs,
         PERCENT_CLIs = TOT_CLIs / d7_panoramica$TOT_CLIs)
knitr::kable(data7_direction)
| DIREZIONE | TOT_TICs | TOT_CLIs | PERCENT_TICs | PERCENT_CLIs |
|---|---|---|---|---|
| -1 | 90189 | 46622 | 0.0903666 | 0.2197865 |
| 1 | 907846 | 212124 | 0.9096334 | 1.0000000 |
# DIREZIONE by hour of day, with shares of each direction's overall totals.
dir_totals <- data7_direction %>%
  select(DIREZIONE, ALL_TOT_TICs = TOT_TICs, ALL_TOT_CLIs = TOT_CLIs)
data7_hour <- data7_complete %>%
  group_by(TIC_HOUR, DIREZIONE) %>%
  summarise(TOT_TICs = n_distinct(ID_SCONTRINO),
            TOT_CLIs = n_distinct(ID_CLI)) %>%
  left_join(dir_totals, by = "DIREZIONE") %>%
  mutate(PERCENT_TICs = TOT_TICs / ALL_TOT_TICs,
         PERCENT_CLIs = TOT_CLIs / ALL_TOT_CLIs) %>%
  select(-ALL_TOT_TICs, -ALL_TOT_CLIs)
knitr::kable(head(data7_hour))
| TIC_HOUR | DIREZIONE | TOT_TICs | TOT_CLIs | PERCENT_TICs | PERCENT_CLIs |
|---|---|---|---|---|---|
| 3 | 1 | 2 | 2 | 0.0000022 | 0.0000094 |
| 4 | 1 | 4 | 4 | 0.0000044 | 0.0000189 |
| 5 | 1 | 2 | 2 | 0.0000022 | 0.0000094 |
| 6 | 1 | 32 | 32 | 0.0000352 | 0.0001509 |
| 7 | -1 | 759 | 638 | 0.0084157 | 0.0136845 |
| 7 | 1 | 9249 | 6309 | 0.0101879 | 0.0297420 |
# Receipts by hour and direction: absolute, then normalised to 100% per hour.
ggplot(data7_hour,
       aes(x = TIC_HOUR, y = TOT_TICs, fill = DIREZIONE)) +
  geom_col() +
  theme_minimal() +
  labs(title = "DISTRIBUZIONE ACQUISTI PER ORA")
ggplot(data7_hour,
       aes(x = TIC_HOUR, y = TOT_TICs, fill = DIREZIONE)) +
  geom_col(position = "fill") +
  theme_minimal()
# DIREZIONE by department code, with shares of each direction's totals.
dir_totals <- data7_direction %>%
  select(DIREZIONE, ALL_TOT_TICs = TOT_TICs, ALL_TOT_CLIs = TOT_CLIs)
data7_rep <- data7_complete %>%
  group_by(COD_REPARTO, DIREZIONE) %>%
  summarise(TOT_TICs = n_distinct(ID_SCONTRINO),
            TOT_CLIs = n_distinct(ID_CLI)) %>%
  left_join(dir_totals, by = "DIREZIONE") %>%
  mutate(PERCENT_TICs = TOT_TICs / ALL_TOT_TICs,
         PERCENT_CLIs = TOT_CLIs / ALL_TOT_CLIs) %>%
  select(-ALL_TOT_TICs, -ALL_TOT_CLIs)
knitr::kable(head(data7_rep))
| COD_REPARTO | DIREZIONE | TOT_TICs | TOT_CLIs | PERCENT_TICs | PERCENT_CLIs |
|---|---|---|---|---|---|
| 1 | -1 | 2512 | 1928 | 0.0278526 | 0.0413539 |
| 1 | 1 | 65151 | 30531 | 0.0717644 | 0.1439300 |
| 2 | -1 | 5604 | 4598 | 0.0621362 | 0.0986230 |
| 2 | 1 | 94618 | 54055 | 0.1042225 | 0.2548274 |
| 3 | -1 | 17083 | 10835 | 0.1894133 | 0.2324010 |
| 3 | 1 | 229972 | 93224 | 0.2533161 | 0.4394788 |
# Receipts by department and direction: absolute, then 100%-normalised.
ggplot(data7_rep,
       aes(x = COD_REPARTO, y = TOT_TICs, fill = DIREZIONE)) +
  geom_col() +
  theme_minimal()
ggplot(data7_rep,
       aes(x = COD_REPARTO, y = TOT_TICs, fill = DIREZIONE)) +
  geom_col(position = "fill") +
  theme_minimal()
# DIREZIONE by day type (weekday/weekend/holiday), with per-direction shares.
dir_totals <- data7_direction %>%
  select(DIREZIONE, ALL_TOT_TICs = TOT_TICs, ALL_TOT_CLIs = TOT_CLIs)
data7_datetyp <- data7_complete %>%
  group_by(TIC_DATE_TYP, DIREZIONE) %>%
  summarise(TOT_TICs = n_distinct(ID_SCONTRINO),
            TOT_CLIs = n_distinct(ID_CLI)) %>%
  left_join(dir_totals, by = "DIREZIONE") %>%
  mutate(PERCENT_TICs = TOT_TICs / ALL_TOT_TICs,
         PERCENT_CLIs = TOT_CLIs / ALL_TOT_CLIs) %>%
  select(-ALL_TOT_TICs, -ALL_TOT_CLIs)
knitr::kable(data7_datetyp)
| TIC_DATE_TYP | DIREZIONE | TOT_TICs | TOT_CLIs | PERCENT_TICs | PERCENT_CLIs |
|---|---|---|---|---|---|
| holiday | -1 | 14522 | 10759 | 0.1610174 | 0.2307709 |
| holiday | 1 | 157868 | 81742 | 0.1738929 | 0.3853501 |
| weekday | -1 | 47294 | 28981 | 0.5243877 | 0.6216164 |
| weekday | 1 | 452679 | 153338 | 0.4986297 | 0.7228696 |
| weekend | -1 | 28373 | 19565 | 0.3145949 | 0.4196517 |
| weekend | 1 | 297299 | 125137 | 0.3274773 | 0.5899238 |
# Receipts by day type and direction: absolute, then 100%-normalised.
ggplot(data7_datetyp,
       aes(x = TIC_DATE_TYP, y = TOT_TICs, fill = DIREZIONE)) +
  geom_col() +
  theme_minimal()
ggplot(data7_datetyp,
       aes(x = TIC_DATE_TYP, y = TOT_TICs, fill = DIREZIONE)) +
  geom_col(position = "fill") +
  theme_minimal()
# Per-receipt totals of gross amount (IMPORTO_LORDO) and discount (SCONTO),
# kept by direction; then mean values per direction.
data7_importosconto <- data7_complete %>%
  group_by(ID_SCONTRINO, DIREZIONE) %>%
  summarise(IMPORTO_LORDO = sum(IMPORTO_LORDO),
            SCONTO = sum(SCONTO)) %>%
  ungroup() %>%
  as.data.frame()
data7_avgimportosconto <- data7_importosconto %>%
  group_by(DIREZIONE) %>%
  summarise(AVG_IMPORTO_LORDO = mean(IMPORTO_LORDO),
            AVG_SCONTO = mean(SCONTO))
# Gross-amount distribution by direction, trimmed to (-1000, 1000) euros for
# readability.
ggplot(data = data7_importosconto %>%
         filter(IMPORTO_LORDO > -1000, IMPORTO_LORDO < 1000),
       aes(color = DIREZIONE, x = IMPORTO_LORDO)) +
  geom_histogram(binwidth = 10, fill = "white", alpha = 0.5) +
  theme_minimal()
# Discount distribution by direction, trimmed to (-250, 250).
# BUG FIX: the original upper bound filtered IMPORTO_LORDO < 250 instead of
# SCONTO < 250 (copy-paste error); both bounds now apply to SCONTO.
ggplot(data = data7_importosconto %>%
         filter(SCONTO > -250, SCONTO < 250),
       aes(color = DIREZIONE, x = SCONTO)) +
  geom_histogram(binwidth = 10, fill = "white", alpha = 0.5) +
  theme_minimal()
# Total gross amount and discount per department and direction.
data7_importosconto_cod_rep <- data7_complete %>%
  group_by(COD_REPARTO, DIREZIONE) %>%
  summarise(IMPORTO_LORDO = sum(IMPORTO_LORDO),
            SCONTO = sum(SCONTO)) %>%
  ungroup() %>%
  as.data.frame()
knitr::kable(head(data7_importosconto_cod_rep))
| COD_REPARTO | DIREZIONE | IMPORTO_LORDO | SCONTO |
|---|---|---|---|
| 1 | -1 | -157326.2 | -11092.76 |
| 1 | 1 | 3043725.9 | 145225.84 |
| 2 | -1 | -2016555.4 | -173055.40 |
| 2 | 1 | 30021937.8 | 2716917.27 |
| 3 | -1 | -681823.8 | -45169.53 |
| 3 | 1 | 8168067.1 | 348269.70 |
# Department totals by direction: gross amount, then discount.
ggplot(data7_importosconto_cod_rep,
       aes(x = COD_REPARTO, y = IMPORTO_LORDO, fill = DIREZIONE)) +
  geom_col() +
  theme_minimal()
ggplot(data7_importosconto_cod_rep,
       aes(x = COD_REPARTO, y = SCONTO, fill = DIREZIONE)) +
  geom_col() +
  theme_minimal()
# Top-selling items (purchases only), ranked by distinct receipts containing
# the item.
data7_complete <- data7_complete %>%
  mutate(ID_ARTICOLO = as.factor(ID_ARTICOLO))
data7_id_articolo <- data7_complete %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_ARTICOLO) %>%
  summarise(NUM_VENDITE = n_distinct(ID_SCONTRINO)) %>%
  ungroup() %>%
  as.data.frame() %>%
  arrange(desc(NUM_VENDITE))
knitr::kable(head(data7_id_articolo))
| ID_ARTICOLO | NUM_VENDITE |
|---|---|
| 33700716 | 57806 |
| 33817091 | 24691 |
| 34843564 | 12804 |
| 32882024 | 6531 |
| 34252904 | 4788 |
| 35209202 | 4758 |
# Per-client totals of gross amount and discount (purchases only), ranked by
# gross amount.
data7_importosconto_id_cli <- data7_complete %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_CLI) %>%
  summarise(IMPORTO_LORDO = sum(IMPORTO_LORDO),
            SCONTO = sum(SCONTO)) %>%
  ungroup() %>%
  as.data.frame() %>%
  arrange(desc(IMPORTO_LORDO))
knitr::kable(head(data7_importosconto_id_cli))
| ID_CLI | IMPORTO_LORDO | SCONTO |
|---|---|---|
| 572977 | 421636.2 | 40609.00 |
| 410777 | 254098.7 | 35618.50 |
| 562939 | 202575.7 | 31966.13 |
| 224908 | 187350.9 | 18708.09 |
| 229720 | 180023.4 | 17738.48 |
| 96592 | 179460.3 | 17913.08 |
# Number of distinct receipts (purchases) per client, most active first.
data7_tot_purch <- data7_complete %>%
  filter(DIREZIONE == 1) %>%
  distinct(ID_CLI, ID_SCONTRINO) %>%
  count(ID_CLI, name = "TOT_PURCHASE") %>%
  arrange(desc(TOT_PURCHASE))
knitr::kable(head(data7_tot_purch))
| ID_CLI | TOT_PURCHASE |
|---|---|
| 376925 | 177 |
| 117212 | 155 |
| 248975 | 154 |
| 341579 | 149 |
| 99384 | 147 |
| 362177 | 146 |
# Next-purchase curve: cumulative share of repeat purchases by number of days
# since the client's previous purchase.
data_next_purchase <- data7_complete %>%
  filter(DIREZIONE == 1) %>%
  select(ID_CLI, ID_ARTICOLO, TIC_DATE, DIREZIONE) %>%
  # BUG FIX: rows must be in chronological order within each client, otherwise
  # TIC_DATE - lag(TIC_DATE) measures gaps in file order, not in time.
  arrange(ID_CLI, TIC_DATE)
# NOTE(review): memory.limit() is Windows-only and defunct since R 4.2; kept
# only for compatibility with the original environment.
memory.limit(10000000)
d7_nextpurchase <- data_next_purchase %>%
  group_by(ID_CLI) %>%
  mutate(Diff = TIC_DATE - lag(TIC_DATE))
x <- as.data.frame(table(d7_nextpurchase$Diff))
# Drop the first row (smallest gap level — presumably same-day lines of one
# visit; TODO confirm against table(d7_nextpurchase$Diff)).
x <- x[-1, ]
x$Perc <- x$Freq / sum(x$Freq)
ggplot(x,
       aes(x = as.numeric(Var1), y = cumsum(Perc))) +
  xlim(0, 100) +
  geom_line()
L’analisi Recency, Frequency, Monetary (RFM) è un’analisi di marketing che utilizza queste 3 metriche per segmentare la base dei clienti. La Recency valuta quanto recentemente il cliente ha fatto un acquisto, la Frequency valuta quanto spesso un cliente fa un acquisto e la Monetary quanti soldi spende il cliente per i suoi acquisti.
Si selezionano i clienti attivi, ovvero quelli che hanno effettuato il loro ultimo acquisto dopo il 01/01/2019:
# RFM study window: keep only transactions after 2019-01-01 (active clients).
rfm_periodo_studio <- data7_complete %>%
  filter(TIC_DATE > as.Date("2019-01-01"))
knitr::kable(head(rfm_periodo_studio))
| ID_SCONTRINO | ID_CLI | ID_NEG | ID_ARTICOLO | COD_REPARTO | DIREZIONE | IMPORTO_LORDO | SCONTO | TIC_DATETIME | TIC_HOUR | TIC_DATE | TIC_DATE_WEEKDAY | TIC_DATE_HOLIDAY | TIC_DATE_TYP |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 51119494802/01/19710359211860 | 669413 | 39 | 36146131 | 6 | 1 | 8.9 | 0.00 | 2019-01-02 07:10:35 | 7 | 2019-01-02 | 4 | FALSE | weekday |
| 51119494802/01/19710359211860 | 669413 | 39 | 36505196 | 10 | 1 | 3.6 | 0.00 | 2019-01-02 07:10:35 | 7 | 2019-01-02 | 4 | FALSE | weekday |
| 511115504902/01/19716539383669 | 754326 | 39 | 36472121 | 7 | 1 | 249.0 | 0.00 | 2019-01-02 07:16:53 | 7 | 2019-01-02 | 4 | FALSE | weekday |
| 511115505002/01/19720238594281 | 358614 | 39 | 34584053 | 1 | 1 | 126.0 | 12.60 | 2019-01-02 07:20:23 | 7 | 2019-01-02 | 4 | FALSE | weekday |
| 511115505002/01/19720238594281 | 358614 | 39 | 34584263 | 1 | 1 | 137.5 | 13.75 | 2019-01-02 07:20:23 | 7 | 2019-01-02 | 4 | FALSE | weekday |
| 511115505002/01/19720238594281 | 358614 | 39 | 33370323 | 11 | 1 | 239.6 | 23.96 | 2019-01-02 07:20:23 | 7 | 2019-01-02 | 4 | FALSE | weekday |
# Recency per client: days between the last purchase and the reference date
# 2019-04-30 (the end of the observed period).
rfm_recency <- rfm_periodo_studio %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_CLI) %>%
  summarise(LAST_PURCHASE_DATE = max(TIC_DATE)) %>%
  mutate(RECENCY = difftime(as.Date("2019-04-30"),
                            LAST_PURCHASE_DATE,
                            units = "days"))
knitr::kable(head(rfm_recency))
| ID_CLI | LAST_PURCHASE_DATE | RECENCY |
|---|---|---|
| 23 | 2019-02-20 | 69 days |
| 32 | 2019-04-02 | 28 days |
| 33 | 2019-03-02 | 59 days |
| 48 | 2019-04-30 | 0 days |
| 50 | 2019-02-19 | 70 days |
| 56 | 2019-03-11 | 50 days |
# Recency classes by quantile: 0-25% = low, 25-75% = medium, 75-100% = high;
# then plot the class sizes.
rfm_recency <- within(rfm_recency,
                      REC_CLASS <- cut(as.numeric(RECENCY),
                                       breaks = quantile(RECENCY,
                                                         probs = c(0, .25, .75, 1)),
                                       include.lowest = TRUE,
                                       labels = c("low", "medium", "high")))
rec_label <- as.data.frame(table(rfm_recency$REC_CLASS))
ggplot(data = rec_label,
       aes(x = Var1, y = Freq,
           fill = Freq)) +
  geom_bar(stat = "identity") +
  labs(title = "Recency Distribution",
       x = "Recency Classes",
       y = "Total Purchase") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Low", "Medium", "High")) +
  # FIX: guides(fill = FALSE) is deprecated since ggplot2 3.3.4; "none" is the
  # supported spelling.
  guides(fill = "none")
# Frequency per client: number of distinct receipts in the study window.
rfm_frequency <- rfm_periodo_studio %>%
  filter(DIREZIONE == 1) %>%
  distinct(ID_CLI, ID_SCONTRINO) %>%
  count(ID_CLI, name = "TOT_PURCHASE") %>%
  arrange(desc(TOT_PURCHASE))
knitr::kable(head(rfm_frequency))
| ID_CLI | TOT_PURCHASE |
|---|---|
| 726098 | 101 |
| 756039 | 92 |
| 740391 | 74 |
| 341579 | 73 |
| 3849 | 71 |
| 776800 | 67 |
# Frequency classes: [0,2) = low, [2,5) = medium, >= 5 = high purchases.
# FIX: the original upper break was hard-coded to 101 (the observed maximum),
# so any client with more than 101 purchases would silently become NA;
# Inf makes the top class open-ended with identical results on current data.
rfm_frequency <- within(rfm_frequency,
                        FREQ_CLASS <- cut(TOT_PURCHASE,
                                          breaks = c(0, 2, 5, Inf),
                                          include.lowest = TRUE,
                                          right = FALSE,
                                          labels = c("low", "medium", "high")))
table(rfm_frequency$FREQ_CLASS)
##
## low medium high
## 52169 44097 16840
# Bar chart of the frequency class sizes.
freq_label <- as.data.frame(table(rfm_frequency$FREQ_CLASS))
ggplot(data = freq_label,
       aes(x = Var1, y = Freq,
           fill = Freq)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency Distribution",
       x = "Frequency Classes",
       y = "Total Purchase") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Low", "Medium", "High")) +
  # FIX: guides(fill = FALSE) is deprecated since ggplot2 3.3.4.
  guides(fill = "none")
# Monetary per client: gross amount, discount, and net spend (SPESA).
rfm_monetary <- rfm_periodo_studio %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_CLI) %>%
  summarise(IMPORTO_LORDO = sum(IMPORTO_LORDO),
            SCONTO = sum(SCONTO)) %>%
  ungroup() %>%
  mutate(SPESA = IMPORTO_LORDO - SCONTO) %>%
  as.data.frame() %>%
  arrange(desc(IMPORTO_LORDO))
knitr::kable(head(rfm_monetary))
| ID_CLI | IMPORTO_LORDO | SCONTO | SPESA |
|---|---|---|---|
| 410777 | 254098.70 | 35618.50 | 218480.20 |
| 562939 | 198332.22 | 31602.10 | 166730.12 |
| 96592 | 174311.92 | 17426.00 | 156885.92 |
| 600594 | 113035.14 | 11411.11 | 101624.03 |
| 4339 | 109380.64 | 15797.16 | 93583.48 |
| 787381 | 97176.15 | 9907.49 | 87268.66 |
# Monetary classes by quantile of net spend: 0-25% = low, 25-75% = medium,
# 75-100% = high.
rfm_monetary <- within(rfm_monetary,
                       MON_CLASS <- cut(SPESA,
                                        breaks = quantile(SPESA,
                                                          probs = c(0, .25, .75, 1)),
                                        include.lowest = TRUE,
                                        labels = c("low", "medium", "high")))
table(rfm_monetary$MON_CLASS)
##
## low medium high
## 28277 56553 28276
# Bar chart of the monetary class sizes.
mon_label <- as.data.frame(table(rfm_monetary$MON_CLASS))
ggplot(data = mon_label,
       aes(x = Var1, y = Freq,
           fill = Freq)) +
  geom_bar(stat = "identity") +
  labs(title = "Monetary Distribution",
       x = "Monetary Classes",
       y = "Total Amount") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Low", "Medium", "High")) +
  # FIX: guides(fill = FALSE) is deprecated since ggplot2 3.3.4.
  guides(fill = "none")
Creazione del dataset contenente tutte e 3 le metriche
# Assemble the full RFM table: frequency + monetary + recency, keyed by client.
rfm <- rfm_frequency %>%
  merge(rfm_monetary, by = "ID_CLI") %>%
  merge(rfm_recency, by = "ID_CLI")
knitr::kable(head(rfm))
| ID_CLI | TOT_PURCHASE | FREQ_CLASS | IMPORTO_LORDO | SCONTO | SPESA | MON_CLASS | LAST_PURCHASE_DATE | RECENCY | REC_CLASS |
|---|---|---|---|---|---|---|---|---|---|
| 23 | 1 | low | 38.07 | 0.00 | 38.07 | low | 2019-02-20 | 69 days | medium |
| 32 | 5 | high | 903.61 | 0.00 | 903.61 | high | 2019-04-02 | 28 days | medium |
| 33 | 1 | low | 48.45 | 0.00 | 48.45 | medium | 2019-03-02 | 59 days | medium |
| 48 | 12 | high | 2135.95 | 306.65 | 1829.30 | high | 2019-04-30 | 0 days | low |
| 50 | 2 | medium | 437.25 | 62.56 | 374.69 | high | 2019-02-19 | 70 days | medium |
| 56 | 4 | medium | 586.49 | 66.30 | 520.19 | high | 2019-03-11 | 50 days | medium |
# Combine the recency and frequency classes into a loyalty status (RF).
# Vectorised replacement for the original 9-branch row loop; same mapping:
#   freq low    + rec low/medium -> One-Timer ; + rec high -> Leaving
#   freq medium + rec low/medium -> Engaged   ; + rec high -> Leaving
#   freq high   + rec low/medium -> Top       ; + rec high -> Leaving Top
rfm$RF <- case_when(
  rfm$FREQ_CLASS == "low"    & rfm$REC_CLASS != "high" ~ "One-Timer",
  rfm$FREQ_CLASS == "medium" & rfm$REC_CLASS != "high" ~ "Engaged",
  rfm$FREQ_CLASS == "high"   & rfm$REC_CLASS != "high" ~ "Top",
  rfm$FREQ_CLASS == "high"                             ~ "Leaving Top",
  TRUE                                                 ~ "Leaving"
)
table(rfm$RF)
##
## Engaged Leaving Leaving Top One-Timer Top
## 36316 27187 592 32763 16248
Creiamo il grafico per la variabile RF:
# RF legend tile chart: hand-built table mapping each (Frequency, Recency)
# cell to its loyalty label and class size.
# NOTE(review): the Value column duplicates table(rfm$RF) by hand — it must be
# kept in sync if the upstream data change.
rf_data <- data.frame(
  Level = c("Top", "Top", "Leaving Top",
            "Engaged", "Engaged", "Leaving",
            "One Timer", "One Timer", "Leaving"),
  Frequency = c("High", "High", "High",
                "Medium", "Medium", "Medium",
                "Low", "Low", "Low"),
  Recency = c("Low", "Medium", "High",
              "Low", "Medium", "High",
              "Low", "Medium", "High"),
  Value = c(16248, 16248, 592,
            36316, 36316, 27187,
            32763, 32763, 27187)
)
rf_data$Frequency <- factor(rf_data$Frequency,
                            levels = c("High", "Medium", "Low"))
rf_data$Recency <- factor(rf_data$Recency,
                          levels = c("High", "Medium", "Low"))
ggplot(rf_data, aes(x = Frequency, y = Recency, fill = Value)) +
  geom_tile() +
  geom_text(aes(label = Level)) +
  scale_fill_distiller(palette = "Spectral") +
  theme_minimal() +
  labs(title = "TABELLA RF")
# Class sizes of the RF segments, printed for reference.
rf <- as.data.frame(table(rfm$RF))
print(rf)
## Var1 Freq
## 1 Engaged 36316
## 2 Leaving 27187
## 3 Leaving Top 592
## 4 One-Timer 32763
## 5 Top 16248
# Bar chart of the RF class sizes.
ggplot(data = rf,
       aes(x = Var1, y = Freq,
           fill = Freq)) +
  geom_bar(stat = "identity") +
  # FIX: the original scale_colour_brewer("RdBu") was a no-op — the bars map
  # `fill`, not `colour`; use the continuous-fill analogue of the intended
  # palette instead.
  scale_fill_distiller(palette = "RdBu") +
  labs(title = "DISTRIBUZIONE CLASSI RF",
       x = "RF Classes",
       y = "Total Clients") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Engaged", "Leaving", "Leaving Top",
                              "One Timer", "Top")) +
  # FIX: guides(fill = FALSE) is deprecated since ggplot2 3.3.4.
  guides(fill = "none")
Infine, le classi RFM sono ottenute combinando le classi RF con le categorie di monetary:
# RFM class: combine the RF status with the monetary class.
# The original 15-branch row loop encodes an exactly diagonal ladder —
# RFM depends only on rank(RF) + rank(MON_CLASS):
#   One-Timer:   low->Cheap,  medium->Tin,    high->Copper
#   Leaving:     low->Tin,    medium->Copper, high->Bronze
#   Engaged:     low->Copper, medium->Bronze, high->Silver
#   Leaving Top: low->Bronze, medium->Silver, high->Gold
#   Top:         low->Silver, medium->Gold,   high->Diamond
# Replaced with a vectorised lookup (identical output, no per-row loop).
rf_rank <- match(rfm$RF,
                 c("One-Timer", "Leaving", "Engaged", "Leaving Top", "Top"))
mon_rank <- match(rfm$MON_CLASS, c("low", "medium", "high"))
rfm_levels <- c("Cheap", "Tin", "Copper", "Bronze", "Silver", "Gold", "Diamond")
rfm$RFM <- rfm_levels[rf_rank + mon_rank - 1]
# RFM legend tile chart: hand-built table of (RF, Monetary) -> level and size.
# NOTE(review): the Value column duplicates table(rfm$RFM) by hand — it must
# be kept in sync if the upstream data change.
rfm_data <- data.frame(
  RF = rep(c("Top", "Leaving Top", "Engaged", "Leaving", "One Timer"),
           each = 3),
  Monetary = rep(c("High", "Medium", "Low"), times = 5),
  Level = c("Diamond", "Gold", "Silver",
            "Gold", "Silver", "Bronze",
            "Silver", "Bronze", "Copper",
            "Bronze", "Copper", "Tin",
            "Copper", "Tin", "Cheap"),
  Value = c(10984, 5585, 10306,
            5585, 10306, 25932,
            10306, 25932, 20938,
            25932, 20938, 24967,
            20938, 24967, 14394)
)
rfm_data$RF <- factor(rfm_data$RF,
                      levels = c("Top", "Leaving Top",
                                 "Engaged", "Leaving", "One Timer"))
rfm_data$Monetary <- factor(rfm_data$Monetary,
                            levels = c("Low", "Medium", "High"))
ggplot(rfm_data, aes(x = RF, y = Monetary, fill = Value)) +
  geom_tile() +
  geom_text(aes(label = Level)) +
  scale_fill_distiller(palette = "RdBu") +
  theme_minimal() +
  labs(title = "TABELLA RFM")
# Bar chart of the RFM class sizes.
rfm_plot <- as.data.frame(table(rfm$RFM))
ggplot(data = rfm_plot,
       aes(x = Var1, y = Freq,
           fill = Freq)) +
  geom_bar(stat = "identity") +
  # FIX: the original scale_colour_brewer("Set1") was a no-op — the bars map
  # `fill`, not `colour`, and "Set1" is a qualitative palette unsuitable for
  # the continuous Freq fill; the line is dropped rather than silently
  # recolouring the chart.
  labs(title = "DISTRIBUZIONE CLASSI RFM",
       x = "RFM Classes",
       y = "Total Clients") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Bronze", "Cheap", "Copper", "Diamond",
                              "Gold", "Silver", "Tin")) +
  # FIX: guides(fill = FALSE) is deprecated since ggplot2 3.3.4.
  guides(fill = "none")
Il churn rate misura il tasso di clienti che, già abbonati a un prodotto o a un servizio, disdicono l’abbonamento entro un certo lasso di tempo.
# Churn study window: purchases with 01/10/2018 < TIC_DATE < 01/01/2019
# (2019-01-01 is the reference date).
churn_periodo_studio <- data7_complete %>%
  filter(DIREZIONE == 1,
         TIC_DATE > as.Date("2018-10-01"),
         TIC_DATE < as.Date("2019-01-01"))
knitr::kable(head(churn_periodo_studio))
| ID_SCONTRINO | ID_CLI | ID_NEG | ID_ARTICOLO | COD_REPARTO | DIREZIONE | IMPORTO_LORDO | SCONTO | TIC_DATETIME | TIC_HOUR | TIC_DATE | TIC_DATE_WEEKDAY | TIC_DATE_HOLIDAY | TIC_DATE_TYP |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 513715359302/10/18708198952096 | 538298 | 15 | 48020112 | 2 | 1 | 224.14 | 0.00 | 2018-10-02 07:08:19 | 7 | 2018-10-02 | 3 | FALSE | weekday |
| 512310497402/10/18710039040884 | 585553 | 29 | 35554932 | 1 | 1 | 90.10 | 16.15 | 2018-10-02 07:10:03 | 7 | 2018-10-02 | 3 | FALSE | weekday |
| 512310497402/10/18710039040884 | 585553 | 29 | 81256071 | 1 | 1 | 59.13 | 15.93 | 2018-10-02 07:10:03 | 7 | 2018-10-02 | 3 | FALSE | weekday |
| 51521089902/10/18711052729620 | 35501 | 4 | 36137402 | 7 | 1 | 50.00 | 5.00 | 2018-10-02 07:11:05 | 7 | 2018-10-02 | 3 | FALSE | weekday |
| 51521089902/10/18711052729620 | 35501 | 4 | 36137472 | 7 | 1 | 130.00 | 13.00 | 2018-10-02 07:11:05 | 7 | 2018-10-02 | 3 | FALSE | weekday |
| 51521089902/10/18711052729620 | 35501 | 4 | 36137605 | 7 | 1 | 130.00 | 13.00 | 2018-10-02 07:11:05 | 7 | 2018-10-02 | 3 | FALSE | weekday |
# Holdout window: purchases with 01/01/2019 < TIC_DATE < 28/02/2019.
# Clients seen here are, by definition, not churners.
churn_holdout <- data7_complete %>%
  filter(DIREZIONE == 1,
         TIC_DATE > as.Date("2019-01-01"),
         TIC_DATE < as.Date("2019-02-28"))
no_churner <- unique(churn_holdout$ID_CLI)
knitr::kable(head(churn_holdout))
| ID_SCONTRINO | ID_CLI | ID_NEG | ID_ARTICOLO | COD_REPARTO | DIREZIONE | IMPORTO_LORDO | SCONTO | TIC_DATETIME | TIC_HOUR | TIC_DATE | TIC_DATE_WEEKDAY | TIC_DATE_HOLIDAY | TIC_DATE_TYP |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 51119494802/01/19710359211860 | 669413 | 39 | 36146131 | 6 | 1 | 8.9 | 0.00 | 2019-01-02 07:10:35 | 7 | 2019-01-02 | 4 | FALSE | weekday |
| 51119494802/01/19710359211860 | 669413 | 39 | 36505196 | 10 | 1 | 3.6 | 0.00 | 2019-01-02 07:10:35 | 7 | 2019-01-02 | 4 | FALSE | weekday |
| 511115504902/01/19716539383669 | 754326 | 39 | 36472121 | 7 | 1 | 249.0 | 0.00 | 2019-01-02 07:16:53 | 7 | 2019-01-02 | 4 | FALSE | weekday |
| 511115505002/01/19720238594281 | 358614 | 39 | 34584053 | 1 | 1 | 126.0 | 12.60 | 2019-01-02 07:20:23 | 7 | 2019-01-02 | 4 | FALSE | weekday |
| 511115505002/01/19720238594281 | 358614 | 39 | 34584263 | 1 | 1 | 137.5 | 13.75 | 2019-01-02 07:20:23 | 7 | 2019-01-02 | 4 | FALSE | weekday |
| 511115505002/01/19720238594281 | 358614 | 39 | 33370323 | 11 | 1 | 239.6 | 23.96 | 2019-01-02 07:20:23 | 7 | 2019-01-02 | 4 | FALSE | weekday |
Successivamente, si sceglie la durata del periodo di lookback prima della data di riferimento:
# Lookback features (3 months before the 2019-01-01 reference date):
# recency, frequency and monetary, computed as in the RFM section.
churn_recency <- churn_periodo_studio %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_CLI) %>%
  summarise(LAST_PURCHASE_DATE = max(TIC_DATE)) %>%
  mutate(RECENCY = difftime(as.Date("2019-01-01"),
                            LAST_PURCHASE_DATE,
                            units = "days"))
churn_frequency <- churn_periodo_studio %>%
  filter(DIREZIONE == 1) %>%
  distinct(ID_CLI, ID_SCONTRINO) %>%
  count(ID_CLI, name = "TOT_PURCHASE") %>%
  arrange(desc(TOT_PURCHASE))
churn_monetary <- churn_periodo_studio %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_CLI) %>%
  summarise(IMPORTO_LORDO = sum(IMPORTO_LORDO),
            SCONTO = sum(SCONTO)) %>%
  ungroup() %>%
  mutate(SPESA = IMPORTO_LORDO - SCONTO) %>%
  as.data.frame() %>%
  arrange(desc(IMPORTO_LORDO))
# Merge the three lookback metrics into the churn modelling table.
churn <- churn_recency %>%
  merge(churn_frequency, by = "ID_CLI") %>%
  merge(churn_monetary, by = "ID_CLI") %>%
  select(ID_CLI,
         RECENCY,
         SPESA,
         TOT_PURCHASE)
knitr::kable(head(churn))
| ID_CLI | RECENCY | SPESA | TOT_PURCHASE |
|---|---|---|---|
| 5 | 39 days | 188.28 | 1 |
| 18 | 39 days | 50.69 | 1 |
| 23 | 11 days | 1736.88 | 8 |
| 28 | 82 days | 1864.48 | 1 |
| 48 | 3 days | 2047.45 | 11 |
| 58 | 5 days | 39.94 | 1 |
# Target variable: CHURN = 1 if the client made no purchase in the holdout
# window, 0 otherwise. (The next-purchase curve cannot be used here: in
# data7_complete a client re-enters at every return purchase, whereas here
# each client is considered exactly once.)
# Vectorised membership test replaces the original per-row loop, which
# re-scanned no_churner once per client.
churn$CHURN <- as.factor(ifelse(churn$ID_CLI %in% no_churner, 0, 1))
table(churn$CHURN)
##
## 0 1
## 35373 59752
# Attach the remaining predictors: TYP_JOB, LAST_COD_FID, and REGION
# (via the client's address).
churn <- left_join(churn, data2_complete[, c("ID_CLI", "TYP_JOB")], by = "ID_CLI")
churn <- left_join(churn, data1_complete[, c("ID_CLI", "LAST_COD_FID")], by = "ID_CLI")
regione <- left_join(data2_complete[, c("ID_CLI", "ID_ADDRESS")],
                     data3_complete[, c("ID_ADDRESS", "REGION")], by = "ID_ADDRESS")
churn <- left_join(churn, regione, by = "ID_CLI")
# FIX: drop the helper join key by name instead of the positional churn[, -8],
# which would silently remove the wrong column if the layout ever changes.
churn$ID_ADDRESS <- NULL
knitr::kable(head(churn))
| ID_CLI | RECENCY | SPESA | TOT_PURCHASE | CHURN | TYP_JOB | LAST_COD_FID | REGION |
|---|---|---|---|---|---|---|---|
| 5 | 39 days | 188.28 | 1 | 1 | (missing) | STANDARD BIZ | LOMBARDIA |
| 18 | 39 days | 50.69 | 1 | 1 | (missing) | STANDARD | LOMBARDIA |
| 23 | 11 days | 1736.88 | 8 | 0 | (missing) | STANDARD | LOMBARDIA |
| 28 | 82 days | 1864.48 | 1 | 1 | (missing) | PREMIUM BIZ | LOMBARDIA |
| 48 | 3 days | 2047.45 | 11 | 0 | Libero professionista | PREMIUM | LOMBARDIA |
| 58 | 5 days | 39.94 | 1 | 1 | (missing) | STANDARD | LOMBARDIA |
Si utilizzano dei modelli per cercare di prevedere se i clienti saranno o meno dei churn. Si provano 4 modelli: Tree, Random forest, Logistic regression e Lasso
# 70/30 stratified train/test split on CHURN (the target is reasonably
# balanced, so no resampling is needed).
churn <- na.omit(churn)
# FIX: seed the RNG so the partition — and every downstream model — is
# reproducible across knits.
set.seed(1234)
train_p <- createDataPartition(churn$CHURN,
                               p = .70,
                               list = FALSE,
                               times = 1)
train <- churn[train_p, ]
test <- churn[-train_p, ]
table(train$CHURN)
##
## 0 1
## 23972 39886
# Classification tree (rpart defaults); TOT_PURCHASE turns out to be the most
# important variable — it is the only one used in the pruned tree below.
tree <- rpart(CHURN ~ RECENCY + SPESA + TOT_PURCHASE + REGION + TYP_JOB,
data = train)
# Visualize the fitted tree and print the detailed split/surrogate summary
rpart.plot(tree, extra = "auto")
summary(tree)
## Call:
## rpart(formula = CHURN ~ RECENCY + SPESA + TOT_PURCHASE + REGION +
## TYP_JOB, data = train)
## n= 63858
##
## CP nsplit rel error xerror xstd
## 1 0.158268 0 1.000000 1.0000000 0.005104468
## 2 0.010000 1 0.841732 0.8542883 0.004920194
##
## Variable importance
## TOT_PURCHASE SPESA
## 89 11
##
## Node number 1: 63858 observations, complexity param=0.158268
## predicted class=1 expected loss=0.3753954 P(node) =1
## class counts: 23972 39886
## probabilities: 0.375 0.625
## left son=2 (13440 obs) right son=3 (50418 obs)
## Primary splits:
## TOT_PURCHASE < 3.5 to the right, improve=2404.40100, (0 missing)
## RECENCY < 25.5 to the left, improve=1783.54700, (0 missing)
## SPESA < 379.805 to the right, improve= 481.01080, (0 missing)
## TYP_JOB splits as LLLLLLRRLRLLLLR, improve= 70.30463, (0 missing)
## REGION splits as LRRLLLLLLRRLLRLLRLRL, improve= 66.00161, (0 missing)
## Surrogate splits:
## SPESA < 970.53 to the right, agree=0.815, adj=0.12, (0 split)
## RECENCY < 1.5 to the left, agree=0.790, adj=0.00, (0 split)
## TYP_JOB splits as RRRRRRRRRRRRLRR, agree=0.790, adj=0.00, (0 split)
##
## Node number 2: 13440 observations
## predicted class=0 expected loss=0.3588542 P(node) =0.210467
## class counts: 8617 4823
## probabilities: 0.641 0.359
##
## Node number 3: 50418 observations
## predicted class=1 expected loss=0.3045539 P(node) =0.789533
## class counts: 15355 35063
## probabilities: 0.305 0.695
# Complexity-parameter table, useful to inspect pruning candidates
printcp(tree)
##
## Classification tree:
## rpart(formula = CHURN ~ RECENCY + SPESA + TOT_PURCHASE + REGION +
## TYP_JOB, data = train)
##
## Variables actually used in tree construction:
## [1] TOT_PURCHASE
##
## Root node error: 23972/63858 = 0.3754
##
## n= 63858
##
## CP nsplit rel error xerror xstd
## 1 0.15827 0 1.00000 1.00000 0.0051045
## 2 0.01000 1 0.84173 0.85429 0.0049202
# memory.limit() is Windows-only and defunct since R 4.2 (it errors there);
# guard the call so the script also runs on other platforms / newer R versions.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(10000000)
}
## [1] 1e+07
# Random forest with 100 trees; mtry is left at the package default
# (2 variables tried at each split, as reported in the output below).
tree_rf <- randomForest::randomForest(CHURN ~ RECENCY + SPESA + TOT_PURCHASE +
REGION + TYP_JOB,
data = train, ntree = 100)
print(tree_rf)
##
## Call:
## randomForest(formula = CHURN ~ RECENCY + SPESA + TOT_PURCHASE + REGION + TYP_JOB, data = train, ntree = 100)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 31.68%
## Confusion matrix:
## 0 1 class.error
## 0 9511 14461 0.6032455
## 1 5772 34114 0.1447124
# Logistic regression fitted through caret (method = "glm"); RECENCY and
# TOT_PURCHASE are the most significant predictors.
# Idiom fix: top-level assignment uses <- instead of =.
logistic <- train(CHURN ~ RECENCY + SPESA + TOT_PURCHASE +
                    REGION + TYP_JOB,
                  data = train,
                  method = "glm")
summary(logistic)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4368 -1.1621 0.6800 0.9144 4.4599
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 3.655e-01 1.632e-01 2.240
## RECENCY 1.635e-02 3.896e-04 41.967
## SPESA 1.231e-05 6.690e-06 1.839
## TOT_PURCHASE -2.210e-01 4.437e-03 -49.814
## REGIONBASILICATA 7.222e-01 1.960e-01 3.686
## REGIONCALABRIA 1.072e+00 2.424e-01 4.420
## REGIONCAMPANIA 1.027e-01 6.574e-02 1.563
## `REGIONEMILIA ROMAGNA` 4.751e-02 6.659e-02 0.714
## `REGIONFRIULI VENEZIA GIULIA` 1.205e-01 9.103e-02 1.324
## REGIONLAZIO 8.435e-02 6.399e-02 1.318
## REGIONLIGURIA 1.212e-01 8.085e-02 1.499
## REGIONLOMBARDIA 3.534e-02 6.027e-02 0.586
## REGIONMARCHE 1.263e+00 2.096e-01 6.023
## REGIONMOLISE 7.474e-01 3.154e-01 2.370
## REGIONPIEMONTE 7.428e-02 6.612e-02 1.124
## REGIONPUGLIA 1.648e-01 6.836e-02 2.411
## REGIONSARDEGNA 1.073e+00 2.923e-01 3.672
## REGIONSICILIA 1.597e-01 6.727e-02 2.374
## REGIONTOSCANA 3.023e-01 7.417e-02 4.076
## `REGIONTRENTINO ALTO ADIGE` 1.079e+00 2.811e-01 3.837
## REGIONUMBRIA 1.246e-01 8.844e-02 1.409
## `REGIONVALLE D'AOSTA` 5.428e-01 3.883e-01 1.398
## REGIONVENETO 8.153e-02 6.447e-02 1.265
## TYP_JOBArtigiano -4.589e-01 3.869e-01 -1.186
## TYP_JOBCasalinga -1.347e-01 4.180e-01 -0.322
## `TYP_JOBCommerciante/Esercente` -1.084e+00 4.726e-01 -2.293
## `TYP_JOBDirigente/Quadro/Funzionario` -7.144e-02 2.735e-01 -0.261
## `TYP_JOBImpiegato/a` -3.778e-01 1.809e-01 -2.088
## TYP_JOBImprenditore 2.209e-01 4.329e-01 0.510
## `TYP_JOBIn cerca di occupazione` -4.507e-02 5.713e-01 -0.079
## `TYP_JOBLibero professionista` -2.209e-01 1.659e-01 -1.331
## `TYP_JOBNon Dichiara` 4.109e-01 3.339e-01 1.231
## `TYP_JOBOperaio/a` -9.171e-01 2.518e-01 -3.642
## `TYP_JOBPensionato/a` -2.235e-01 2.321e-01 -0.963
## `TYP_JOBRappresentante/Agente Commerciale` -1.166e+00 1.331e+00 -0.876
## TYP_JOBStudente -2.859e-01 7.189e-01 -0.398
## `TYP_JOB(missing)` 6.472e-02 1.521e-01 0.426
## Pr(>|z|)
## (Intercept) 0.025113 *
## RECENCY < 2e-16 ***
## SPESA 0.065863 .
## TOT_PURCHASE < 2e-16 ***
## REGIONBASILICATA 0.000228 ***
## REGIONCALABRIA 9.87e-06 ***
## REGIONCAMPANIA 0.118116
## `REGIONEMILIA ROMAGNA` 0.475533
## `REGIONFRIULI VENEZIA GIULIA` 0.185564
## REGIONLAZIO 0.187424
## REGIONLIGURIA 0.133850
## REGIONLOMBARDIA 0.557597
## REGIONMARCHE 1.71e-09 ***
## REGIONMOLISE 0.017791 *
## REGIONPIEMONTE 0.261223
## REGIONPUGLIA 0.015926 *
## REGIONSARDEGNA 0.000241 ***
## REGIONSICILIA 0.017620 *
## REGIONTOSCANA 4.59e-05 ***
## `REGIONTRENTINO ALTO ADIGE` 0.000124 ***
## REGIONUMBRIA 0.158699
## `REGIONVALLE D'AOSTA` 0.162092
## REGIONVENETO 0.206004
## TYP_JOBArtigiano 0.235549
## TYP_JOBCasalinga 0.747294
## `TYP_JOBCommerciante/Esercente` 0.021854 *
## `TYP_JOBDirigente/Quadro/Funzionario` 0.793911
## `TYP_JOBImpiegato/a` 0.036772 *
## TYP_JOBImprenditore 0.609903
## `TYP_JOBIn cerca di occupazione` 0.937122
## `TYP_JOBLibero professionista` 0.183101
## `TYP_JOBNon Dichiara` 0.218442
## `TYP_JOBOperaio/a` 0.000270 ***
## `TYP_JOBPensionato/a` 0.335605
## `TYP_JOBRappresentante/Agente Commerciale` 0.381019
## TYP_JOBStudente 0.690867
## `TYP_JOB(missing)` 0.670439
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 84518 on 63857 degrees of freedom
## Residual deviance: 75926 on 63821 degrees of freedom
## AIC: 76000
##
## Number of Fisher Scoring iterations: 4
# Elastic-net / lasso logistic regression via caret (method = "glmnet");
# alpha and lambda are tuned by bootstrap resampling.
# Idiom fix: <- for assignment and normalized spacing in family = "binomial".
lasso <- train(CHURN ~ RECENCY + SPESA + TOT_PURCHASE +
                 REGION + TYP_JOB,
               data = train,
               method = "glmnet",
               family = "binomial")
lasso
## glmnet
##
## 63858 samples
## 5 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 63858, 63858, 63858, 63858, 63858, 63858, ...
## Resampling results across tuning parameters:
##
## alpha lambda Accuracy Kappa
## 0.10 0.0002840866 0.6932920 0.2747197
## 0.10 0.0028408662 0.6932988 0.2746677
## 0.10 0.0284086619 0.6919673 0.2634784
## 0.55 0.0002840866 0.6934521 0.2754803
## 0.55 0.0028408662 0.6934877 0.2745276
## 0.55 0.0284086619 0.6891087 0.2490805
## 1.00 0.0002840866 0.6934385 0.2755592
## 1.00 0.0028408662 0.6934211 0.2738142
## 1.00 0.0284086619 0.6865659 0.2367597
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.55 and lambda = 0.002840866.
# Accuracy profile across the (alpha, lambda) tuning grid
plot(lasso)
Analisi delle prediction per ogni modello
#si valutano accuracy e tutte le metriche di predizione dei modelli che sono stati addestrati
# Tree predictions on the test set; column 5 (the CHURN target) is excluded
# from the newdata before predicting.
pred_tree <- predict(tree, test[,-5], type = "class")
p1 <- unlist(pred_tree)
confusionMatrix(p1, test$CHURN)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3675 2154
## 1 6598 14939
##
## Accuracy : 0.6802
## 95% CI : (0.6746, 0.6857)
## No Information Rate : 0.6246
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2536
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3577
## Specificity : 0.8740
## Pos Pred Value : 0.6305
## Neg Pred Value : 0.6936
## Prevalence : 0.3754
## Detection Rate : 0.1343
## Detection Prevalence : 0.2130
## Balanced Accuracy : 0.6159
##
## 'Positive' Class : 0
##
# Random-forest class predictions and confusion matrix on the test set
pred_rf <- predict(tree_rf, test[,-5], type = "class")
confusionMatrix(pred_rf, test$CHURN)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4031 2423
## 1 6242 14670
##
## Accuracy : 0.6834
## 95% CI : (0.6778, 0.6889)
## No Information Rate : 0.6246
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2707
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3924
## Specificity : 0.8582
## Pos Pred Value : 0.6246
## Neg Pred Value : 0.7015
## Prevalence : 0.3754
## Detection Rate : 0.1473
## Detection Prevalence : 0.2358
## Balanced Accuracy : 0.6253
##
## 'Positive' Class : 0
##
# Logistic-regression class predictions (caret type = "raw" returns classes)
pred_log <- predict(logistic, test[,-5], type= "raw")
confusionMatrix(pred_log, test$CHURN)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3523 1709
## 1 6750 15384
##
## Accuracy : 0.6909
## 95% CI : (0.6854, 0.6964)
## No Information Rate : 0.6246
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2693
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3429
## Specificity : 0.9000
## Pos Pred Value : 0.6734
## Neg Pred Value : 0.6950
## Prevalence : 0.3754
## Detection Rate : 0.1287
## Detection Prevalence : 0.1912
## Balanced Accuracy : 0.6215
##
## 'Positive' Class : 0
##
# Lasso class predictions and confusion matrix on the test set
pred_lasso <- predict(lasso, test[,-5], type = "raw")
confusionMatrix(pred_lasso, test$CHURN)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3452 1657
## 1 6821 15436
##
## Accuracy : 0.6902
## 95% CI : (0.6847, 0.6957)
## No Information Rate : 0.6246
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2657
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3360
## Specificity : 0.9031
## Pos Pred Value : 0.6757
## Neg Pred Value : 0.6935
## Prevalence : 0.3754
## Detection Rate : 0.1261
## Detection Prevalence : 0.1867
## Balanced Accuracy : 0.6195
##
## 'Positive' Class : 0
##
Calcolo di curve gain e lift per ogni modello
# Class-"0" probabilities from each model on the test set, collected into one
# data frame for the gain/lift analysis (the tree is the weakest of the four).
prob_tree <- predict(tree, test[,-5], "prob")[,1]
prob_rf <- predict(tree_rf, test[,-5], "prob")[,1]
prob_log <- predict(logistic, test[,-5], "prob")[,1]
prob_lasso <- predict(lasso, test[,-5], "prob")[,1]
# Build the frame directly with data.frame() instead of the original
# as.data.frame(cbind(...)) + cbind + colnames<- round-trip: one step, and no
# intermediate matrix coercion.
d_class <- data.frame(prob_tree = prob_tree,
                      prob_rf = prob_rf,
                      prob_log = prob_log,
                      prob_lasso = prob_lasso,
                      churn = test$CHURN)
head(d_class)
## prob_tree prob_rf prob_log prob_lasso churn
## 11 0.3045539 0.18 0.3513166 0.3496427 1
## 12 0.3045539 0.13 0.2001193 0.2013677 1
## 15 0.3045539 0.55 0.3534518 0.3615758 1
## 16 0.3045539 0.07 0.3240113 0.3323843 1
## 17 0.3045539 0.04 0.2374554 0.2391976 1
## 23 0.3045539 0.07 0.2670007 0.2753297 0
# Cumulative gain/lift table for the decision-tree churn scores (<- for assignment)
lift_tree <- gain_lift(data = d_class, score = 'prob_tree', target = 'churn')
## Population Gain Lift Score.Point
## 1 10 35.77 3.58 0.6411458
## 2 20 35.77 1.79 0.6411458
## 3 30 100.00 3.33 0.3045539
## 4 40 100.00 2.50 0.3045539
## 5 50 100.00 2.00 0.3045539
## 6 60 100.00 1.67 0.3045539
## 7 70 100.00 1.43 0.3045539
## 8 80 100.00 1.25 0.3045539
## 9 90 100.00 1.11 0.3045539
## 10 100 100.00 1.00 0.3045539
# Cumulative gain/lift table for the random-forest churn scores (<- for assignment)
lift_rf <- gain_lift(data = d_class, score = 'prob_rf', target = 'churn')
## Population Gain Lift Score.Point
## 1 10 19.74 1.97 0.74
## 2 20 34.70 1.74 0.56
## 3 30 47.34 1.58 0.41
## 4 40 57.31 1.43 0.29
## 5 50 67.30 1.35 0.20
## 6 60 75.56 1.26 0.14
## 7 70 82.35 1.18 0.10
## 8 80 90.12 1.13 0.06
## 9 90 95.82 1.06 0.03
## 10 100 100.00 1.00 0.00
# Cumulative gain/lift table for the logistic-regression churn scores (<- for assignment)
lift_log <- gain_lift(data = d_class, score = 'prob_log', target = 'churn')
## Population Gain Lift Score.Point
## 1 10 19.90 1.99 0.61693588
## 2 20 35.38 1.77 0.49056291
## 3 30 48.51 1.62 0.42561676
## 4 40 59.01 1.48 0.38361604
## 5 50 68.12 1.36 0.34270890
## 6 60 75.62 1.26 0.30109781
## 7 70 83.00 1.19 0.26437609
## 8 80 89.28 1.12 0.22403495
## 9 90 94.84 1.05 0.18236461
## 10 100 100.00 1.00 0.04927497
# Cumulative gain/lift table for the lasso churn scores (<- for assignment)
lift_lasso <- gain_lift(data = d_class, score = 'prob_lasso', target = 'churn')
## Population Gain Lift Score.Point
## 1 10 19.84 1.98 0.61034711
## 2 20 35.40 1.77 0.48726157
## 3 30 48.53 1.62 0.42384815
## 4 40 59.06 1.48 0.38365000
## 5 50 68.08 1.36 0.34396966
## 6 60 75.76 1.26 0.30359914
## 7 70 82.98 1.19 0.26614535
## 8 80 89.21 1.12 0.22648916
## 9 90 94.87 1.05 0.18637853
## 10 100 100.00 1.00 0.06264479
Risultati dei modelli su clienti di cui non si conosce ancora se saranno churn o meno
#si applicano tutti i passaggi effettuati in precedenza con la differenza che la data di riferimento sarĂ  il 01/01/2019 ed il periodo di scoring andrĂ  dal 01/03/2019 al 30/04/2019
# Named window boundaries (ISO format avoids the %d/%m/%Y parsing boilerplate)
scoring_start <- as.Date("2019-03-01")
scoring_end <- as.Date("2019-04-30")
# Purchases (DIREZIONE == 1) inside the scoring window. Bounds are inclusive:
# the original strict < / > comparisons silently dropped transactions occurring
# exactly on the first and last day of the declared period.
churn_periodo_studio_pred <- data7_complete %>%
  filter(DIREZIONE == 1,
         TIC_DATE >= scoring_start,
         TIC_DATE <= scoring_end)
# Recency: days from each client's last purchase to the end of the window.
# (The redundant filter(DIREZIONE == 1) steps are removed: the source frame
# is already restricted to purchases above.)
churn_recency_pred <- churn_periodo_studio_pred %>%
  group_by(ID_CLI) %>%
  summarise(LAST_PURCHASE_DATE = max(TIC_DATE))
churn_recency_pred$RECENCY <- difftime(scoring_end,
                                       churn_recency_pred$LAST_PURCHASE_DATE,
                                       units = "days")
# Frequency: distinct receipts per client
churn_frequency_pred <- churn_periodo_studio_pred %>%
  group_by(ID_CLI) %>%
  summarise(TOT_PURCHASE = n_distinct(ID_SCONTRINO)) %>%
  arrange(desc(TOT_PURCHASE))
# Monetary: gross amount minus discount per client
churn_monetary_pred <- churn_periodo_studio_pred %>%
  group_by(ID_CLI) %>%
  summarise(IMPORTO_LORDO = sum(IMPORTO_LORDO),
            SCONTO = sum(SCONTO),
            SPESA = IMPORTO_LORDO - SCONTO) %>%
  ungroup() %>%
  as.data.frame() %>%
  arrange(desc(IMPORTO_LORDO))
# Assemble the RFM table for scoring
churn_pred <- merge(churn_recency_pred, churn_frequency_pred, by = "ID_CLI")
churn_pred <- merge(churn_pred, churn_monetary_pred, by = "ID_CLI") %>%
  select(ID_CLI,
         RECENCY,
         SPESA,
         TOT_PURCHASE)
knitr::kable(head(churn_pred))
| ID_CLI | RECENCY | SPESA | TOT_PURCHASE |
|---|---|---|---|
| 32 | 28 days | 903.61 | 5 |
| 33 | 59 days | 48.45 | 1 |
| 48 | 4 days | 1350.99 | 9 |
| 56 | 50 days | 38.74 | 1 |
| 76 | 53 days | 224.85 | 1 |
| 77 | 3 days | 1200.04 | 8 |
# Attach the same predictors used at training time: TYP_JOB, LAST_COD_FID, REGION
churn_pred <- left_join(churn_pred, data2_complete[, c("ID_CLI", "TYP_JOB")], by = "ID_CLI")
churn_pred <- left_join(churn_pred, data1_complete[, c("ID_CLI", "LAST_COD_FID")], by = "ID_CLI")
# Client -> region lookup through the address id
regione <- left_join(data2_complete[, c("ID_CLI", "ID_ADDRESS")],
data3_complete[, c("ID_ADDRESS", "REGION")], by = "ID_ADDRESS")
churn_pred <- left_join(churn_pred, regione, by = "ID_CLI")
# Drop the join key by name; the original positional churn_pred[, -7] breaks
# silently if the column layout ever changes.
churn_pred$ID_ADDRESS <- NULL
knitr::kable(head(churn_pred))
| ID_CLI | RECENCY | SPESA | TOT_PURCHASE | TYP_JOB | LAST_COD_FID | REGION |
|---|---|---|---|---|---|---|
| 32 | 28 days | 903.61 | 5 | (missing) | PREMIUM BIZ | LOMBARDIA |
| 33 | 59 days | 48.45 | 1 | (missing) | PREMIUM | LOMBARDIA |
| 48 | 4 days | 1350.99 | 9 | Libero professionista | PREMIUM | LOMBARDIA |
| 56 | 50 days | 38.74 | 1 | (missing) | STANDARD BIZ | LOMBARDIA |
| 76 | 53 days | 224.85 | 1 | (missing) | STANDARD | MARCHE |
| 77 | 3 days | 1200.04 | 8 | (missing) | PREMIUM | LOMBARDIA |
# Drop incomplete rows, then score each client with the logistic model;
# column 2 of the probability matrix is P(CHURN == 1).
churn_pred <- na.omit(churn_pred)
churn_pred$prob_to_churn <- predict(logistic, churn_pred, type = "prob")[,2]
knitr::kable(head(churn_pred))
| ID_CLI | RECENCY | SPESA | TOT_PURCHASE | TYP_JOB | LAST_COD_FID | REGION | prob_to_churn |
|---|---|---|---|---|---|---|---|
| 32 | 28 days | 903.61 | 5 | (missing) | PREMIUM BIZ | LOMBARDIA | 0.4574334 |
| 33 | 59 days | 48.45 | 1 | (missing) | PREMIUM | LOMBARDIA | 0.7702695 |
| 48 | 4 days | 1350.99 | 9 | Libero professionista | PREMIUM | LOMBARDIA | 0.1509283 |
| 56 | 50 days | 38.74 | 1 | (missing) | STANDARD BIZ | LOMBARDIA | 0.7431783 |
| 76 | 53 days | 224.85 | 1 | (missing) | STANDARD | MARCHE | 0.9122222 |
| 77 | 3 days | 1200.04 | 8 | (missing) | PREMIUM | LOMBARDIA | 0.2246235 |